httpz

Hyper-fast HTTP Scraping Tool
git clone git://git.acid.vegas/httpz.git

commit a006a1dac4c41fa47a6ddc82ff6813bde774d498
parent 2698bb5bc0d9d3fc33be85c57e9b915cca3c523f
Author: acidvegas <acid.vegas@acid.vegas>
Date: Tue, 11 Feb 2025 00:03:28 -0500

Code cleanup

Diffstat:
M httpz.py | 150 ++++++++++++++++++++++++++++++++------------------------------------------------

1 file changed, 60 insertions(+), 90 deletions(-)

diff --git a/httpz.py b/httpz.py
@@ -69,15 +69,15 @@ class Colors:
 	GRAY       = '\033[90m'       # Gray color
 	CYAN       = '\033[96m'       # Cyan color
 
-
-_SILENT_MODE = False
+# Global for silent mode
+SILENT_MODE = False
 
 def debug(msg: str): 
-	if not _SILENT_MODE: logging.debug(msg)
+	if not SILENT_MODE: logging.debug(msg)
 def error(msg: str):
-	if not _SILENT_MODE: logging.error(msg)
+	if not SILENT_MODE: logging.error(msg)
 def info(msg: str):
-	if not _SILENT_MODE: logging.info(msg)
+	if not SILENT_MODE: logging.info(msg)
 
 
 async def get_cert_info(ssl_object, url: str) -> dict:
@@ -130,7 +130,7 @@ async def get_cert_info(ssl_object, url: str) -> dict:
 			'serial_number' : format(cert.serial_number, 'x'),
 		}
 	except Exception as e:
-		debug(f'Error getting cert info for {url}: {str(e)}')
+		error(f'Error getting cert info for {url}: {str(e)}')
 		return None
 
 
@@ -239,7 +239,7 @@ async def load_resolvers(resolver_file: str = None) -> list:
 		async with aiohttp.ClientSession() as session:
 			async with session.get('https://raw.githubusercontent.com/trickest/resolvers/refs/heads/main/resolvers.txt') as response:
 				resolvers = await response.text()
-				if not _SILENT_MODE:
+				if not SILENT_MODE:
 					info(f'Loaded {len(resolvers.splitlines()):,} resolvers.')
 				return [resolver.strip() for resolver in resolvers.splitlines()]
 
@@ -302,29 +302,20 @@ async def resolve_all_dns(domain: str, timeout: int = 5, nameserver: str = None,
 	return sorted(set(ips)), cname, nameservers, ns_ips
 
 
-async def check_domain(session: aiohttp.ClientSession, domain: str, follow_redirects: bool = False, timeout: int = 5, check_axfr: bool = False, resolvers: list = None) -> dict:
+def parse_domain_url(domain: str) -> tuple:
 	'''
-	Check a single domain for its status code, title, and body preview
+	Parse domain string into base domain, port, and protocol list
 	
-	:param session: aiohttp client session
-	:param domain: domain to check
-	:param follow_redirects: whether to follow redirects
-	:param timeout: timeout in seconds
-	:param check_axfr: whether to check for AXFR
-	:param resolvers: list of DNS resolvers to use
+	:param domain: Raw domain string to parse
+	:return: Tuple of (base_domain, port, protocols)
 	'''
-	# Pick random resolver for this domain
-	nameserver = random.choice(resolvers) if resolvers else None
 
-	# Parse domain and port
 	port = None
 	base_domain = domain.rstrip('/')
 	
-	# Handle URLs with existing protocol
 	if base_domain.startswith(('http://', 'https://')):
 		protocol = 'https://' if base_domain.startswith('https://') else 'http://'
 		base_domain = base_domain.split('://', 1)[1]
-		# Try to extract port from domain
 		if ':' in base_domain.split('/')[0]:
 			base_domain, port_str = base_domain.split(':', 1)
 			try:
@@ -335,22 +326,34 @@ async def check_domain(session: aiohttp.ClientSession, domain: str, follow_redir
 			port = 443 if protocol == 'https://' else 80
 		protocols = [f'{protocol}{base_domain}{":" + str(port) if port else ""}']
 	else:
-		# No protocol specified - try HTTPS first, then HTTP
 		if ':' in base_domain.split('/')[0]:
 			base_domain, port_str = base_domain.split(':', 1)
-			try:
-				port = int(port_str.split('/')[0])
-			except ValueError:
-				port = 443  # Default to HTTPS port
+			port = int(port_str.split('/')[0]) if port_str.split('/')[0].isdigit() else 443
 		else:
-			port = 443  # Default to HTTPS port when no port specified
+			port = 443
 		protocols = [
 			f'https://{base_domain}{":" + str(port) if port else ""}',
 			f'http://{base_domain}{":" + str(port) if port else ""}'
 		]
+	
+	return base_domain, port, protocols
 
-	result = {}  # Start with empty dict
-	base_result = {
+
+async def check_domain(session: aiohttp.ClientSession, domain: str, follow_redirects: bool = False, timeout: int = 5, check_axfr: bool = False, resolvers: list = None) -> dict:
+	'''
+	Check a single domain for its status code, title, and body preview
+	
+	:param session: aiohttp client session
+	:param domain: domain to check
+	:param follow_redirects: whether to follow redirects
+	:param timeout: timeout in seconds
+	:param check_axfr: whether to check for AXFR
+	:param resolvers: list of DNS resolvers to use
+	'''
+	nameserver = random.choice(resolvers) if resolvers else None
+	base_domain, port, protocols = parse_domain_url(domain)
+	
+	result = {
 		'domain'         : base_domain,
 		'status'         : 0,
 		'title'          : None,
@@ -367,37 +370,26 @@ async def check_domain(session: aiohttp.ClientSession, domain: str, follow_redir
 		'redirect_chain' : [],
 		'tls'            : None
 	}
-	result.update(base_result)  # Update result with base fields
-
-	# Do all DNS lookups at once
-	ips, cname, nameservers, ns_ips = await resolve_all_dns(base_domain, timeout, nameserver, check_axfr)
-
-	result['ips']         = ips
-	result['cname']       = cname
-	result['nameservers'] = nameservers
 
+	# Do DNS lookups
+	result['ips'], result['cname'], result['nameservers'], _ = await resolve_all_dns(base_domain, timeout, nameserver, check_axfr)
 
+	# Try each protocol
 	for url in protocols:
 		try:
-			max_redirects = 10 if follow_redirects else 0
-			async with session.get(url, timeout=timeout, allow_redirects=follow_redirects, max_redirects=max_redirects) as response:
-				result['status']         = response.status
-				result['url']            = str(response.url)
-				result['headers']        = dict(response.headers)
-				result['content_type']   = response.headers.get('content-type', '').split(';')[0]
-				result['content_length'] = response.headers.get('content-length')
-				
-				# Track redirect chain
-				if follow_redirects:
-					result['redirect_chain'] = [str(h.url) for h in response.history]
-					if result['redirect_chain']:
-						result['redirect_chain'].append(str(response.url))
-
-				# Try to get cert info for any successful HTTPS connection
+			async with session.get(url, timeout=timeout, allow_redirects=follow_redirects, max_redirects=10 if follow_redirects else 0) as response:
+				result.update({
+					'status'         : response.status,
+					'url'            : str(response.url),
+					'headers'        : dict(response.headers),
+					'content_type'   : response.headers.get('content-type', '').split(';')[0],
+					'content_length' : response.headers.get('content-length'),
+					'redirect_chain' : [str(h.url) for h in response.history] + [str(response.url)] if follow_redirects and response.history else []
+				})
+
 				if response.url.scheme == 'https':
 					try:
-						ssl_object = response._protocol.transport.get_extra_info('ssl_object')
-						if ssl_object:  # Only get cert info if we have a valid SSL object
+						if ssl_object := response._protocol.transport.get_extra_info('ssl_object'):
 							result['tls'] = await get_cert_info(ssl_object, str(response.url))
 					except AttributeError:
 						debug(f'Failed to get SSL info for {url}')
@@ -405,27 +397,21 @@ async def check_domain(session: aiohttp.ClientSession, domain: str, follow_redir
 				if response.status == 200:
 					html = (await response.text())[:1024*1024]
 					soup = bs4.BeautifulSoup(html, 'html.parser')
-					if soup.title:
-						title = ' '.join(soup.title.string.strip().split()).rstrip('.') if soup.title.string else ''
-						result['title'] = title[:300]
-					if soup.get_text():
-						body = ' '.join(soup.get_text().split()).rstrip('.')
-						result['body'] = body[:500]
-					result['favicon_hash'] = await get_favicon_hash(session, url, html)
+					result.update({
+						'title'        : ' '.join(soup.title.string.strip().split()).rstrip('.')[:300] if soup.title and soup.title.string else None,
+						'body'         : ' '.join(soup.get_text().split()).rstrip('.')[:500] if soup.get_text() else None,
+						'favicon_hash' : await get_favicon_hash(session, url, html)
+					})
 					break
 		except Exception as e:
 			debug(f'Error checking {url}: {str(e)}')
 			result['status'] = -1
 			continue
 
-	# Make absolutely sure port is in the result before returning
-	if 'port' not in result:
-		result['port'] = port
-
 	return result
 
 
-def format_status_output(result: dict, debug: bool = False, show_fields: dict = None, match_codes: set = None, exclude_codes: set = None) -> str:
+def format_console_output(result: dict, debug: bool = False, show_fields: dict = None, match_codes: set = None, exclude_codes: set = None) -> str:
 	'''
 	Format the output with colored sections
 	
@@ -568,32 +554,24 @@ async def process_domains(input_source: str = None, debug: bool = False, concurr
 	:param resolver_file: Path to file containing DNS resolvers
 	'''
 
+	# Check if input file exists
 	if input_source and input_source != '-' and not os.path.exists(input_source):
 		raise FileNotFoundError(f'Domain file not found: {input_source}')
 
-	# Clear the output file if specified
-	if output_file:
-		open(output_file, 'w').close()
-
+	# Initialize tasks and processed domains
 	tasks             = set()
-	processed_domains = 0  # Simple counter for all processed domains
+	processed_domains = 0
 	
 	# Load resolvers - await the coroutine
 	resolvers = await load_resolvers(resolver_file)
 
-
 	async def write_result(result: dict):
-
 		'''Write a single result to the output file'''
+
 		nonlocal processed_domains
 		
 		# Create JSON output dict with required fields
-		output_dict = {
-			'url'    : result['url'],
-			'domain' : result['domain'],
-			'status' : result['status'],
-			'port'   : result['port']
-		}
+		output_dict = {'url': result['url'], 'domain': result['domain'], 'status': result['status'], 'port': result['port']}
 		
 		# Add optional fields if they exist
 		if result['title']:
@@ -616,7 +594,8 @@ async def process_domains(input_source: str = None, debug: bool = False, concurr
 			output_dict['nameservers'] = result['nameservers']
 
 		# Get formatted output based on filters
-		formatted = format_status_output(result, debug, show_fields, match_codes, exclude_codes)
+		formatted = format_console_output(result, debug, show_fields, match_codes, exclude_codes)
+		
 		if formatted:
 			# Write to file if specified
 			if output_file:
@@ -665,7 +644,7 @@ async def process_domains(input_source: str = None, debug: bool = False, concurr
 def main():
 	'''Main function to handle command line arguments and run the domain checker'''
 
-	global _SILENT_MODE
+	global SILENT_MODE
 	
 	# Setup argument parser
 	parser = argparse.ArgumentParser(description=f'{Colors.GREEN}Hyper-fast HTTP Scraping Tool{Colors.RESET}', formatter_class=argparse.RawDescriptionHelpFormatter)
@@ -698,17 +677,11 @@ def main():
 	parser.add_argument('-p',  '--progress', action='store_true', help='Show progress counter')
 	parser.add_argument('-r',  '--resolvers', help='File containing DNS resolvers (one per line)')
 	parser.add_argument('-to', '--timeout', type=int, default=5, help='Request timeout in seconds')
-
 	
 	# Parse arguments
 	args = parser.parse_args()
 
-	# Set silent mode based on jsonl argument
-	_SILENT_MODE = args.jsonl
-
-	# Only setup logging if we're not in silent mode
-	if not _SILENT_MODE:
-
+	if not (SILENT_MODE := args.jsonl):
 		# Setup logging
 		if args.debug:
 			apv.setup_logging(level='DEBUG', log_to_disk=True, log_file_name='havoc', show_details=True)
@@ -716,8 +689,6 @@ def main():
 		else:
 			apv.setup_logging(level='INFO')
 
-		logging.info('Starting domain checker...')
-
 		if args.file == '-':
 			logging.info('Reading domains from stdin')
 		else:
@@ -743,7 +714,6 @@ def main():
 		show_fields = {k: True for k in show_fields}
 
 	try:
-		# Run the domain checker
 		asyncio.run(process_domains(args.file, args.debug, args.concurrent, show_fields, args.output, args.jsonl, args.timeout, args.match_codes, args.exclude_codes, args.progress, check_axfr=args.axfr, resolver_file=args.resolvers))
 	except KeyboardInterrupt:
 		logging.warning('Process interrupted by user')
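For reference, a minimal sketch of exercising the parse_domain_url() helper extracted in this commit. The import path and the sample inputs are assumptions for illustration only; the expected return values follow the parsing logic visible in the hunks above.

	# Hypothetical standalone usage of the helper introduced in this commit.
	# Assumes httpz.py is importable as the module "httpz".
	from httpz import parse_domain_url

	# Bare domain: defaults to port 443 and returns both protocols to try.
	print(parse_domain_url('example.com'))
	# expected: ('example.com', 443, ['https://example.com:443', 'http://example.com:443'])

	# Explicit protocol and port: only that protocol is returned.
	print(parse_domain_url('https://example.com:8443/'))
	# expected: ('example.com', 8443, ['https://example.com:8443'])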