httpz

- Hyper-fast HTTP Scraping Tool
git clone git://git.acid.vegas/httpz.git

parsers.py (6516B)

#!/usr/bin/env python3
# HTTPZ Web Scanner - Developed by acidvegas in Python (https://github.com/acidvegas/httpz)
# httpz_scanner/parsers.py

import argparse

try:
    import bs4
except ImportError:
    raise ImportError('missing bs4 module (pip install beautifulsoup4)')

try:
    from cryptography                   import x509
    from cryptography.hazmat.primitives import hashes
    from cryptography.x509.oid          import NameOID
except ImportError:
    raise ImportError('missing cryptography module (pip install cryptography)')

try:
    import mmh3
except ImportError:
    raise ImportError('missing mmh3 module (pip install mmh3)')

from .utils import debug, error


def parse_domain_url(domain: str) -> tuple:
    '''
    Parse domain string into base domain, port, and protocol list

    :param domain: Raw domain string to parse
    '''

    port = None
    base_domain = domain.rstrip('/')

    if base_domain.startswith(('http://', 'https://')):
        protocol = 'https://' if base_domain.startswith('https://') else 'http://'
        base_domain = base_domain.split('://', 1)[1]
        if ':' in base_domain.split('/')[0]:
            base_domain, port_str = base_domain.split(':', 1)
            try:
                port = int(port_str.split('/')[0])
            except ValueError:
                port = None
    else:
        if ':' in base_domain.split('/')[0]:
            base_domain, port_str = base_domain.split(':', 1)
            port = int(port_str.split('/')[0]) if port_str.split('/')[0].isdigit() else None

    protocols = ['http://', 'https://']  # Always try HTTP first

    return base_domain, port, protocols

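# Example (illustrative comments, not part of the original module): a few representative
# inputs and the tuples the logic above produces. The returned protocol list is always
# ['http://', 'https://'] regardless of any scheme in the input; the scheme only affects
# how the port is split off.
#
#   parse_domain_url('example.com')               -> ('example.com', None, ['http://', 'https://'])
#   parse_domain_url('https://example.com:8443/') -> ('example.com', 8443, ['http://', 'https://'])
#   parse_domain_url('example.com:8080')          -> ('example.com', 8080, ['http://', 'https://'])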

async def get_cert_info(ssl_object, url: str) -> dict:
    '''
    Get SSL certificate information for a domain

    :param ssl_object: SSL object to get certificate info from
    :param url: URL to get certificate info from
    '''

    try:
        if not ssl_object or not (cert_der := ssl_object.getpeercert(binary_form=True)):
            return None

        cert = x509.load_der_x509_certificate(cert_der)

        try:
            san_extension = cert.extensions.get_extension_for_oid(x509.oid.ExtensionOID.SUBJECT_ALTERNATIVE_NAME)
            alt_names     = [name.value for name in san_extension.value] if san_extension else []
        except x509.extensions.ExtensionNotFound:
            alt_names = []

        try:
            common_name = cert.subject.get_attributes_for_oid(NameOID.COMMON_NAME)[0].value
        except IndexError:
            common_name = None

        try:
            issuer = cert.issuer.get_attributes_for_oid(NameOID.COMMON_NAME)[0].value
        except IndexError:
            issuer = None

        return {
            'fingerprint'   : cert.fingerprint(hashes.SHA256()).hex(),
            'common_name'   : common_name,
            'issuer'        : issuer,
            'alt_names'     : alt_names,
            'not_before'    : cert.not_valid_before_utc.isoformat(),
            'not_after'     : cert.not_valid_after_utc.isoformat(),
            'version'       : cert.version.value,
            'serial_number' : format(cert.serial_number, 'x'),
        }
    except Exception as e:
        error(f'Error getting cert info for {url}: {str(e)}')
        return None

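# Sketch of a call site (assumed, for illustration only -- the real caller lives elsewhere
# in httpz_scanner): after an aiohttp request completes, the peer certificate can be pulled
# from the connection's SSL object and passed in here.
#
#   async with session.get(url) as response:
#       ssl_object = response.connection.transport.get_extra_info('ssl_object')
#       cert_info  = await get_cert_info(ssl_object, url)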

async def get_favicon_hash(session, base_url: str, html: str) -> str:
    '''
    Get favicon hash from a webpage

    :param session: aiohttp client session
    :param base_url: base URL of the website
    :param html: HTML content of the page
    '''

    try:
        soup = bs4.BeautifulSoup(html, 'html.parser')

        favicon_url = None
        for link in soup.find_all('link'):
            if link.get('rel') and any(x.lower() == 'icon' for x in link.get('rel')):
                favicon_url = link.get('href')
                break

        if not favicon_url:
            favicon_url = '/favicon.ico'

        if favicon_url.startswith('//'):
            favicon_url = 'https:' + favicon_url
        elif favicon_url.startswith('/'):
            favicon_url = base_url + favicon_url
        elif not favicon_url.startswith(('http://', 'https://')):
            favicon_url = base_url + '/' + favicon_url

        async with session.get(favicon_url, timeout=10) as response:
            if response.status == 200:
                content    = (await response.read())[:1024*1024]
                hash_value = mmh3.hash64(content)[0]
                if hash_value != 0:
                    return str(hash_value)

    except Exception as e:
        debug(f'Error getting favicon for {base_url}: {str(e)}')

    return None

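# Sketch of a call site (assumed, for illustration only): the value returned above is the
# 64-bit MurmurHash3 of at most the first 1 MB of the favicon body, or None if no usable
# favicon was fetched.
#
#   async with session.get(url) as response:
#       html         = await response.text()
#       favicon_hash = await get_favicon_hash(session, 'https://example.com', html)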

def parse_status_codes(codes_str: str) -> set:
    '''
    Parse comma-separated status codes and ranges into a set of integers

    :param codes_str: Comma-separated status codes (e.g., "200,301-399,404,500-503")
    '''

    codes = set()
    try:
        for part in codes_str.split(','):
            if '-' in part:
                start, end = map(int, part.split('-'))
                codes.update(range(start, end + 1))
            else:
                codes.add(int(part))
        return codes
    except ValueError:
        raise argparse.ArgumentTypeError('Invalid status code format. Use comma-separated numbers or ranges (e.g., 200,301-399,404,500-503)')

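# Example (illustrative): because invalid input raises argparse.ArgumentTypeError, this
# function can be plugged straight into argparse as a `type=` converter.
#
#   parse_status_codes('200,301-302,404,500-503') -> {200, 301, 302, 404, 500, 501, 502, 503}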

def parse_shard(shard_str: str) -> tuple:
    '''
    Parse shard argument in format INDEX/TOTAL

    :param shard_str: Shard string in format "INDEX/TOTAL"
    '''

    try:
        shard_index, total_shards = map(int, shard_str.split('/'))
        if shard_index < 1 or total_shards < 1 or shard_index > total_shards:
            raise ValueError
        return shard_index - 1, total_shards  # Convert to 0-based index
    except (ValueError, TypeError):
        raise argparse.ArgumentTypeError('Shard must be in format INDEX/TOTAL where INDEX <= TOTAL')

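# Example (illustrative): "2/5" is read as "instance 2 of 5", presumably so several scanner
# instances can split one input list; the returned index is 0-based.
#
#   parse_shard('2/5') -> (1, 5)
#   parse_shard('6/5') -> raises argparse.ArgumentTypeError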

def parse_title(html: str, content_type: str = None) -> str:
    '''
    Parse title from HTML content

    :param html: HTML content of the page
    :param content_type: Content-Type header value
    '''

    # Only parse title for HTML content
    if content_type and not any(x in content_type.lower() for x in ['text/html', 'application/xhtml']):
        return None

    try:
        soup = bs4.BeautifulSoup(html, 'html.parser')
        if (title := soup.title) and title.string:
            return title.string.strip()
    except Exception:
        pass

    return None
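
# Example (illustrative): titles are only extracted for HTML content types; anything else
# returns None.
#
#   parse_title('<html><head><title> httpz </title></head></html>')   -> 'httpz'
#   parse_title('{"title": "nope"}', content_type='application/json') -> None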