import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, parse_qs
import re
import time
import csv
import random
from collections import OrderedDict
import os

# Pool of browser User-Agent strings (desktop Chrome on Windows/macOS,
# desktop Firefox on Linux, mobile Safari on iPhone). get_random_headers()
# picks one at random for every outgoing request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1"
]

def get_random_headers():
    """Build browser-like request headers around a randomly chosen User-Agent.

    Returns:
        dict: ``User-Agent`` (drawn from ``USER_AGENTS``), ``Accept``,
        ``Accept-Language`` and ``Connection`` header values.
    """
    chosen_agent = random.choice(USER_AGENTS)
    # Start with the rotating header, then append the fixed ones.
    headers = {"User-Agent": chosen_agent}
    headers.update({
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Connection": "keep-alive",
    })
    return headers

def resolve_redirected_url(url):
    """Unwrap search-engine redirect links and follow HTTP redirects.

    Google ``/url?q=...`` and Yahoo ``/d/search...?u=...`` wrapper links are
    replaced by the target stored in their query string. The result is then
    probed with a HEAD request so that the final post-redirect URL is
    returned.

    Args:
        url: An href as found in a page; may be relative, malformed, or use
            a non-HTTP scheme.

    Returns:
        str: The final URL after redirects, or the (possibly unwrapped) URL
        unchanged when it cannot be parsed, is not an absolute HTTP(S) URL,
        or the request fails.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # Malformed href (e.g. unbalanced IPv6 brackets): nothing to resolve.
        return url

    host = parsed.hostname or ""
    if host == "www.google.com" and parsed.path == "/url":
        target = parse_qs(parsed.query).get('q')
        if target:
            url = target[0]
    elif (host == "yahoo.com" or host.endswith(".yahoo.com")) and parsed.path.startswith("/d/search"):
        # Match on a label boundary so look-alike hosts such as
        # "notyahoo.com" are not treated as Yahoo redirectors.
        target = parse_qs(parsed.query).get('u')
        if target:
            url = target[0]

    # requests can only fetch absolute http(s) URLs; relative links,
    # "mailto:", "javascript:" etc. would just raise, so skip the round trip.
    if not url.lstrip().lower().startswith(("http://", "https://")):
        return url

    try:
        response = requests.head(url, headers=get_random_headers(), allow_redirects=True, timeout=5)
        return response.url
    except requests.RequestException:
        return url

def is_valid_url(url):
    """Report whether a HEAD request to ``url`` ends in HTTP 200.

    Redirects are followed; any ``requests`` error (bad URL, timeout,
    connection failure) counts as invalid.
    """
    try:
        reply = requests.head(
            url,
            headers=get_random_headers(),
            allow_redirects=True,
            timeout=5,
        )
    except requests.RequestException:
        return False
    return reply.status_code == 200

def extract_subdomains(url):
    """Return every subdomain level contained in the hostname of ``url``.

    For ``https://a.b.example.com/`` this yields
    ``{"a.b.example.com", "b.example.com"}``; the last two labels are treated
    as the registered domain and are never returned on their own.

    Args:
        url: URL whose hostname should be inspected.

    Returns:
        set[str]: Subdomain names, empty when the URL has no hostname, cannot
        be parsed, or the hostname has two labels or fewer.
    """
    try:
        hostname = urlparse(url).hostname
    except ValueError:
        # Unparseable URL (e.g. broken IPv6 literal) has no usable hostname.
        return set()
    if not hostname:
        # Scheme-less or relative input: urlparse() reports no host.
        return set()

    labels = hostname.split('.')
    # Drop leading labels one at a time, stopping before the last two.
    return {'.'.join(labels[start:]).strip() for start in range(len(labels) - 2)}

# (engine name, query URL template, page number -> pagination offset).
# Each engine paginates differently: Google uses a 0-based result offset,
# Bing and Yahoo a 1-based result offset, Yandex a 0-based page index.
_SEARCH_ENGINES = (
    (
        "Google",
        "https://www.google.com/search?q=site%3A*.{domain}+-inurl%3A(www.{domain})&start={offset}",
        lambda page: (page - 1) * 10,
    ),
    (
        "Bing",
        "https://www.bing.com/search?q=site%3A*.{domain}+-inurl%3A(www.{domain})&first={offset}",
        lambda page: (page - 1) * 10 + 1,
    ),
    (
        "Yahoo",
        "https://search.yahoo.com/search?p=site%3A*.{domain}+-inurl%3A(www.{domain})&b={offset}",
        lambda page: (page - 1) * 10 + 1,
    ),
    (
        "Yandex",
        "https://yandex.com/search/?text=site%3A*.{domain}+-inurl%3Awww.{domain}&p={offset}",
        lambda page: page - 1,
    ),
)


def _host_in_domain(hostname, domain):
    """Return True when ``hostname`` is ``domain`` itself or a subdomain of it."""
    domain = domain.lower()
    # Require a label boundary so "notexample.com" does not match "example.com".
    return hostname == domain or hostname.endswith("." + domain)


def _collect_hosts_from_page(query_url, domain, found):
    """Fetch one results page and add every matching link hostname to ``found``."""
    response = requests.get(query_url, headers=get_random_headers(), timeout=5)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    for link in soup.find_all('a', href=True):
        try:
            hostname = urlparse(resolve_redirected_url(link['href'])).hostname
        except ValueError:
            # One malformed href must not discard the rest of the page.
            continue
        if hostname and _host_in_domain(hostname, domain):
            found.add(hostname)


def search_subdomains(domain, max_pages=2):
    """Search Google, Bing, Yahoo and Yandex for hosts under ``domain``.

    Each engine is queried with a ``site:*.<domain>`` search for pages
    1..``max_pages``; every link on the result pages is resolved and its
    hostname kept when it equals ``domain`` or is a subdomain of it.

    Args:
        domain: Registered domain to search for, e.g. ``"example.com"``.
        max_pages: Number of result pages to request per engine.

    Returns:
        set[str]: Bare hostnames (no scheme) found in the results. Failures
        of individual pages are printed and skipped, never raised.
    """
    subdomains = set()
    if not domain:
        # An empty domain would match every hostname; there is nothing to search.
        return subdomains

    for engine, template, offset_for in _SEARCH_ENGINES:
        for page in range(1, max_pages + 1):
            query = template.format(domain=domain, offset=offset_for(page))
            try:
                _collect_hosts_from_page(query, domain, subdomains)
                # Pause between successful requests to stay polite to the engine.
                time.sleep(1)
            except Exception as e:
                # Best effort: report the failed page and carry on with the next.
                print(f"Error searching {engine} (page {page}) for subdomains of {domain}: {e}")

    return subdomains

def crawl_url(url, visited=None, subdomain_queue=None, all_urls=None, depth=0, max_depth=2):
    """Recursively crawl ``url`` and collect domains, subdomains, folders and URLs.

    Only links whose hostname is the page's registered domain (its last two
    labels) or a subdomain of it are followed. Links pointing at a different
    host of that domain are additionally queued as ``https://<host>`` in
    ``subdomain_queue``.

    Args:
        url: Absolute URL to crawl.
        visited: Set of URLs already handled; updated in place.
        subdomain_queue: Set of subdomain root URLs to crawl later; updated
            in place.
        all_urls: Set of every same-site URL seen; updated in place.
        depth: Current recursion depth.
        max_depth: Deepest level that is still fetched.

    Returns:
        tuple: ``(domains, subdomains, folders, subdomain_queue, all_urls)``.
        The first three are new sets for this subtree; the last two are the
        shared sets passed in (or freshly created ones).
    """
    if visited is None:
        visited = set()
    if subdomain_queue is None:
        subdomain_queue = set()
    if all_urls is None:
        all_urls = set()

    if depth > max_depth or url in visited:
        return set(), set(), set(), subdomain_queue, all_urls

    # Mark as visited before probing so a dead URL that is linked from many
    # pages costs a single HEAD request instead of one per occurrence.
    visited.add(url)
    if not is_valid_url(url):
        return set(), set(), set(), subdomain_queue, all_urls

    domains = set()
    subdomains = set()
    folders = set()
    all_urls.add(url)

    try:
        response = requests.get(url, headers=get_random_headers(), timeout=5)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        parsed_url = urlparse(url)
        domain = parsed_url.hostname
        domains.add(domain)
        subdomains.update(extract_subdomains(url))

        path = parsed_url.path
        if path and path != '/':
            folders.add(path)

        # Registered domain approximated as the last two labels; slicing
        # (rather than indexing [-2]) also copes with single-label hosts such
        # as "localhost". NOTE: multi-part suffixes like "co.uk" are not
        # recognised by this heuristic.
        site = '.'.join(domain.split('.')[-2:])
        site_suffix = '.' + site

        for link in soup.find_all('a', href=True):
            try:
                absolute_url = urljoin(url, link['href'])
                parsed_absolute = urlparse(absolute_url)
                host = parsed_absolute.hostname
            except ValueError:
                # Malformed href; skip it instead of aborting the whole crawl.
                continue

            # Compare on a label boundary so "notexample.com" is not treated
            # as part of "example.com".
            if not host or not (host == site or host.endswith(site_suffix)):
                continue

            all_urls.add(absolute_url)
            if parsed_absolute.path and parsed_absolute.path != '/':
                folders.add(parsed_absolute.path)
            if host != domain:
                subdomain_queue.add(f"https://{host}")
            new_domains, new_subdomains, new_folders, subdomain_queue, all_urls = crawl_url(
                absolute_url, visited, subdomain_queue, all_urls, depth + 1, max_depth
            )
            domains.update(new_domains)
            subdomains.update(new_subdomains)
            folders.update(new_folders)

    except requests.RequestException as e:
        print(f"Error crawling {url}: {e}")

    return domains, subdomains, folders, subdomain_queue, all_urls

def export_to_csv(domains, subdomains, folders, all_urls, filename="crawler_results.csv", urls_filename="downloadable_urls.txt"):
    """Write the crawl results to a CSV file and the URL list to a text file.

    Both files are created inside ``~/storage/shared/``; the directory is
    created first if it does not exist. Duplicates are removed from every
    input while keeping first-seen order.

    Args:
        domains: Iterable of domain names, written as ``Domain`` rows.
        subdomains: Iterable of subdomain names, written as ``Subdomain`` rows.
        folders: Iterable of URL paths, written as ``Folder`` rows.
        all_urls: Iterable of URLs, written one per line to the text file.
        filename: Name of the CSV file inside the output directory.
        urls_filename: Name of the URL list file inside the output directory.

    Raises:
        OSError: If the directory or either file cannot be created/written.
    """
    output_dir = os.path.expanduser("~/storage/shared/")
    # Without this, open() fails on systems where the shared storage folder
    # has not been set up yet.
    os.makedirs(output_dir, exist_ok=True)
    filename = os.path.join(output_dir, filename)
    urls_filename = os.path.join(output_dir, urls_filename)

    # Order-preserving de-duplication, one (label, values) pair per CSV section.
    sections = (
        ("Domain", list(OrderedDict.fromkeys(domains))),
        ("Subdomain", list(OrderedDict.fromkeys(subdomains))),
        ("Folder", list(OrderedDict.fromkeys(folders))),
    )
    unique_urls = list(OrderedDict.fromkeys(all_urls))

    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Type", "Value"])
        for label, values in sections:
            writer.writerows([label, value] for value in values)
    print(f"Results exported to {filename}")

    with open(urls_filename, 'w', encoding='utf-8') as f:
        f.writelines(url + '\n' for url in unique_urls)
    print(f"Downloadable URLs exported to {urls_filename}")

def crawl_and_process(start_urls):
    """Crawl the start URLs and their discovered subdomains, then export results.

    For every start URL the registered domain (last two hostname labels) is
    searched for subdomains via ``search_subdomains``; the page is crawled
    with ``crawl_url``; finally every queued subdomain root is crawled too.
    One ``visited`` set is shared by all crawls so no page is fetched twice.

    Args:
        start_urls: Iterable of URLs or bare hostnames; ``https://`` is
            prepended when no http(s) scheme is present.

    Returns:
        tuple: ``(all_domains, all_subdomains, all_folders, all_urls)`` sets.
    """
    all_domains = set()
    all_subdomains = set()
    all_folders = set()
    subdomain_queue = set()
    all_urls = set()
    visited = set()

    for start_url in start_urls:
        start_url = start_url.strip()
        if not start_url:
            continue
        if not start_url.startswith(('http://', 'https://')):
            start_url = f"https://{start_url}"

        try:
            domain = urlparse(start_url).hostname
        except ValueError:
            # Unparseable entry; skip it rather than abort the whole batch.
            print(f"Skipping invalid URL: {start_url}")
            continue

        if domain:
            print(f"Searching subdomains for {domain}...")
            # Slicing copes with single-label hosts where [-2] would raise.
            base_domain = '.'.join(domain.split('.')[-2:])
            # search_subdomains() yields bare hostnames, but the queue holds
            # URLs that crawl_url() can fetch, so add the scheme here.
            for host in search_subdomains(base_domain):
                subdomain_queue.add(f"https://{host}")

        domains, subdomains, folders, subdomain_queue, all_urls = crawl_url(
            start_url, visited=visited, subdomain_queue=subdomain_queue, all_urls=all_urls
        )
        all_domains.update(domains)
        all_subdomains.update(subdomains)
        all_folders.update(folders)

    dequeued = set()
    while subdomain_queue:
        subdomain_url = subdomain_queue.pop()
        if subdomain_url in dequeued or subdomain_url in visited:
            # Already handled; crawls may re-add roots they link back to.
            continue
        dequeued.add(subdomain_url)
        print(f"Crawling subdomain: {subdomain_url}")
        # Pass the live queue and visited set: omitting the queue would make
        # crawl_url() return a fresh one and silently drop all pending entries.
        domains, subdomains, folders, subdomain_queue, all_urls = crawl_url(
            subdomain_url, visited=visited, subdomain_queue=subdomain_queue, all_urls=all_urls
        )
        all_domains.update(domains)
        all_subdomains.update(subdomains)
        all_folders.update(folders)

    export_to_csv(all_domains, all_subdomains, all_folders, all_urls)

    return all_domains, all_subdomains, all_folders, all_urls

def main():
    """Interactive entry point: ask for a URL or batch mode, then crawl.

    Typing ``batch`` loads the non-blank lines of
    ``~/storage/shared/batchurls.txt`` as start URLs; anything else is used
    as the single start URL. Any exception raised while crawling is reported
    on stdout instead of propagating.
    """
    for banner_line in (
        "Web Crawler for Termux",
        "Enter a single URL or 'batch' to use batchurls.txt",
        "Example: https://example.com or batch",
    ):
        print(banner_line)

    choice = input("Enter URL or 'batch': ").strip()

    if choice.lower() != 'batch':
        start_urls = [choice]
    else:
        batch_file = os.path.expanduser("~/storage/shared/batchurls.txt")
        if not os.path.exists(batch_file):
            print(f"Error: {batch_file} not found.")
            return
        with open(batch_file, 'r') as f:
            # Keep stripped, non-empty lines only.
            start_urls = [entry for entry in map(str.strip, f) if entry]
        print(f"Loaded {len(start_urls)} URLs from {batch_file}")

    try:
        print("Crawling started...")
        domains, subdomains, folders, all_urls = crawl_and_process(start_urls)
        print(f"\nResults:\nDomains: {len(domains)}\nSubdomains: {len(subdomains)}\nFolders: {len(folders)}\nURLs for download: {len(all_urls)}")
        print("Results saved to ~/storage/shared/crawler_results.csv")
        print("URLs saved to ~/storage/shared/downloadable_urls.txt")
    except Exception as e:
        print(f"Crawling failed: {e}")


if __name__ == "__main__":
    main()
