httpz - Hyper-fast HTTP Scraping Tool
git clone git://git.acid.vegas/httpz.git
parsers.py (6516B)
#!/usr/bin/env python3
# HTTPZ Web Scanner - Developed by acidvegas in Python (https://github.com/acidvegas/httpz)
# httpz_scanner/parsers.py

import argparse

try:
    import bs4
except ImportError:
    raise ImportError('missing bs4 module (pip install beautifulsoup4)')

try:
    from cryptography import x509
    from cryptography.hazmat.primitives import hashes
    from cryptography.x509.oid import NameOID
except ImportError:
    raise ImportError('missing cryptography module (pip install cryptography)')

try:
    import mmh3
except ImportError:
    raise ImportError('missing mmh3 module (pip install mmh3)')

from .utils import debug, error


def parse_domain_url(domain: str) -> tuple:
    '''
    Parse a domain string into base domain, port, and protocol list

    :param domain: Raw domain string to parse
    '''

    port        = None
    base_domain = domain.rstrip('/')

    if base_domain.startswith(('http://', 'https://')):
        # Strip the scheme, then pull an explicit port off the host part if present
        base_domain = base_domain.split('://', 1)[1]
        if ':' in base_domain.split('/')[0]:
            base_domain, port_str = base_domain.split(':', 1)
            try:
                port = int(port_str.split('/')[0])
            except ValueError:
                port = None
    else:
        if ':' in base_domain.split('/')[0]:
            base_domain, port_str = base_domain.split(':', 1)
            port = int(port_str.split('/')[0]) if port_str.split('/')[0].isdigit() else None

    protocols = ['http://', 'https://'] # Always try HTTP first

    return base_domain, port, protocols


async def get_cert_info(ssl_object, url: str) -> dict:
    '''
    Get SSL certificate information for a domain

    :param ssl_object: SSL object to get certificate info from
    :param url: URL to get certificate info from
    '''

    try:
        # Grab the peer certificate in DER form; bail out if the handshake gave us nothing
        if not ssl_object or not (cert_der := ssl_object.getpeercert(binary_form=True)):
            return None

        cert = x509.load_der_x509_certificate(cert_der)

        try:
            san_extension = cert.extensions.get_extension_for_oid(x509.oid.ExtensionOID.SUBJECT_ALTERNATIVE_NAME)
            alt_names     = [name.value for name in san_extension.value] if san_extension else []
        except x509.extensions.ExtensionNotFound:
            alt_names = []

        try:
            common_name = cert.subject.get_attributes_for_oid(NameOID.COMMON_NAME)[0].value
        except IndexError:
            common_name = None

        try:
            issuer = cert.issuer.get_attributes_for_oid(NameOID.COMMON_NAME)[0].value
        except IndexError:
            issuer = None

        return {
            'fingerprint'   : cert.fingerprint(hashes.SHA256()).hex(),
            'common_name'   : common_name,
            'issuer'        : issuer,
            'alt_names'     : alt_names,
            'not_before'    : cert.not_valid_before_utc.isoformat(),
            'not_after'     : cert.not_valid_after_utc.isoformat(),
            'version'       : cert.version.value,
            'serial_number' : format(cert.serial_number, 'x'),
        }
    except Exception as e:
        error(f'Error getting cert info for {url}: {str(e)}')
        return None


async def get_favicon_hash(session, base_url: str, html: str) -> str:
    '''
    Get favicon hash from a webpage

    :param session: aiohttp client session
    :param base_url: base URL of the website
    :param html: HTML content of the page
    '''

    try:
        soup = bs4.BeautifulSoup(html, 'html.parser')

        # Look for an explicit <link rel="icon"> (or "shortcut icon") tag
        favicon_url = None
        for link in soup.find_all('link'):
            if link.get('rel') and any(x.lower() == 'icon' for x in link.get('rel')):
                favicon_url = link.get('href')
                break

        # Fall back to the conventional location
        if not favicon_url:
            favicon_url = '/favicon.ico'

        # Resolve protocol-relative and relative URLs against the base URL
        if favicon_url.startswith('//'):
            favicon_url = 'https:' + favicon_url
        elif favicon_url.startswith('/'):
            favicon_url = base_url + favicon_url
        elif not favicon_url.startswith(('http://', 'https://')):
            favicon_url = base_url + '/' + favicon_url

        async with session.get(favicon_url, timeout=10) as response:
            if response.status == 200:
                content    = (await response.read())[:1024*1024] # Cap the download at 1MB
                hash_value = mmh3.hash64(content)[0]
                if hash_value != 0:
                    return str(hash_value)

    except Exception as e:
        debug(f'Error getting favicon for {base_url}: {str(e)}')

    return None


def parse_status_codes(codes_str: str) -> set:
    '''
    Parse comma-separated status codes and ranges into a set of integers

    :param codes_str: Comma-separated status codes (e.g., "200,301-399,404,500-503")
    '''

    codes = set()
    try:
        for part in codes_str.split(','):
            if '-' in part:
                start, end = map(int, part.split('-'))
                codes.update(range(start, end + 1))
            else:
                codes.add(int(part))
        return codes
    except ValueError:
        raise argparse.ArgumentTypeError('Invalid status code format. Use comma-separated numbers or ranges (e.g., 200,301-399,404,500-503)')


def parse_shard(shard_str: str) -> tuple:
    '''
    Parse shard argument in format INDEX/TOTAL

    :param shard_str: Shard string in format "INDEX/TOTAL"
    '''

    try:
        shard_index, total_shards = map(int, shard_str.split('/'))
        if shard_index < 1 or total_shards < 1 or shard_index > total_shards:
            raise ValueError
        return shard_index - 1, total_shards # Convert to 0-based index
    except (ValueError, TypeError):
        raise argparse.ArgumentTypeError('Shard must be in format INDEX/TOTAL where INDEX <= TOTAL')


def parse_title(html: str, content_type: str = None) -> str:
    '''
    Parse the title from HTML content

    :param html: HTML content of the page
    :param content_type: Content-Type header value
    '''

    # Only parse the title for HTML content types
    if content_type and not any(x in content_type.lower() for x in ['text/html', 'application/xhtml']):
        return None

    try:
        soup = bs4.BeautifulSoup(html, 'html.parser')
        if (title := soup.title) and title.string:
            return title.string.strip()
    except Exception:
        pass

    return None
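
Below is a short, hypothetical usage sketch of the synchronous parsers in this file. The import path `httpz_scanner.parsers` comes from the header comment above; the sample inputs and the results shown in comments are illustrative, not taken from the project's own documentation:

from httpz_scanner.parsers import parse_domain_url, parse_status_codes, parse_shard, parse_title

# Split a raw target into host, port, and the protocols to try
base_domain, port, protocols = parse_domain_url('https://example.com:8443/path')
print(base_domain, port, protocols) # example.com 8443 ['http://', 'https://']

# Expand a comma-separated code/range list into a set of integers
print(parse_status_codes('200,301-303')) # {200, 301, 302, 303}

# Shards come back as a 0-based (index, total) tuple
print(parse_shard('1/4')) # (0, 4)

# Titles are only extracted for HTML content types
print(parse_title('<html><head><title> Example </title></head></html>', 'text/html')) # Example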
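
And a minimal async sketch for the favicon helper, assuming aiohttp is installed; the target URL is a placeholder and the fetched page's HTML is simply reused as the input:

import asyncio
import aiohttp

from httpz_scanner.parsers import get_favicon_hash

async def main():
    async with aiohttp.ClientSession() as session:
        # Fetch a page, then hash whatever favicon it advertises (or /favicon.ico)
        async with session.get('https://example.com') as response:
            html = await response.text()
        print(await get_favicon_hash(session, 'https://example.com', html))

asyncio.run(main())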