import requests, csv, os, re, time, threading, zipfile
from urllib.parse import unquote, urlparse
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

# === CONFIG ===
# HTTP headers sent with both the search requests and the file downloads.
headers = {"User-Agent": "Mozilla/5.0"}
# Directory that downloaded files are written to; created on import if missing.
output_folder = "downloads"
os.makedirs(output_folder, exist_ok=True)
# Target URLs already collected by parse_yahoo_results. Module-level, so it is
# shared by every query issued during the lifetime of the process.
visited_urls = set()
# NOTE(review): this lock is never acquired in the code visible here —
# presumably intended to guard shared state across download threads; confirm.
lock = threading.Lock()

def get_actual_url(yahoo_url):
    """Pull the real destination out of a Yahoo redirect link.

    The destination is the text between ``RU=`` and the first following
    ``/RK``; it is returned percent-decoded. Returns ``None`` when the link
    does not contain that pattern.
    """
    found = re.search(r'RU=(.+?)/RK', yahoo_url)
    if not found:
        return None
    encoded_target = found.group(1)
    return unquote(encoded_target)

def parse_yahoo_results(query, max_pages=50):
    """Collect target URLs for *query* from Yahoo search result pages.

    Requests up to ``max_pages`` result pages (offset ``b`` advances by 10 per
    page), extracts the real destination from every Yahoo redirect link, and
    returns the destinations that were not already in the module-level
    ``visited_urls`` set. Newly found URLs are added to ``visited_urls``.

    Paging stops at the first page whose request fails or returns an HTTP
    error status; whatever was collected up to that point is returned.
    """
    results = []
    for page in range(max_pages):
        # Let requests build the query string so that spaces, '&', '#', '+'
        # and similar characters inside the dork are escaped correctly.
        params = {"p": query, "b": page * 10 + 1}
        try:
            r = requests.get(
                "https://search.yahoo.com/search",
                params=params,
                headers=headers,
                timeout=10,
            )
            # An error page contains no results; stop instead of parsing it
            # and continuing to send requests.
            r.raise_for_status()
        except requests.RequestException as e:
            print(f"[!] Search request failed for '{query}' (page {page + 1}): {e}")
            break

        soup = BeautifulSoup(r.text, 'html.parser')
        for link in soup.find_all('a', href=True):
            href = link['href']
            if '/RU=' not in href:
                continue
            actual_url = get_actual_url(href)
            if actual_url and actual_url not in visited_urls:
                visited_urls.add(actual_url)
                results.append(actual_url)

        # Pause between pages to avoid hammering the search endpoint.
        time.sleep(1)
    return results

def extract_subdomains_and_paths(urls, base_domain):
    """Derive subdomains of *base_domain* and folder prefixes from *urls*.

    Args:
        urls: Iterable of absolute URL strings.
        base_domain: Registered domain such as ``example.com`` (compared
            case-insensitively).

    Returns:
        ``(subdomains, folders)`` where ``subdomains`` is the set of hostnames
        that end with ``.<base_domain>``, and ``folders`` is the set of every
        leading directory prefix (``/a/``, ``/a/b/``, ...) of every URL path,
        excluding the final path segment. Folders are collected from all URLs
        regardless of their host.

    URLs that ``urlparse`` rejects are skipped.
    """
    base = base_domain.lower()
    # Match on a label boundary: "notexample.com" and "example.com.evil.org"
    # contain the base domain as a substring but are not subdomains of it.
    suffix = f".{base}"
    subdomains = set()
    folders = set()
    for url in urls:
        try:
            parsed = urlparse(url)
            host = parsed.hostname  # already lower-cased by urlparse
        except ValueError:
            # e.g. unbalanced brackets in the netloc ("Invalid IPv6 URL")
            continue
        if base and host and host.endswith(suffix):
            subdomains.add(host)
        path_parts = parsed.path.strip("/").split('/')
        # Stop before the last segment: it is the file, not a folder.
        for depth in range(1, len(path_parts)):
            folders.add('/' + '/'.join(path_parts[:depth]) + '/')
    return subdomains, folders

def save_to_csv(csv_path, urls):
    """Append *urls* to the one-column CSV at *csv_path*, skipping duplicates.

    A URL is skipped when it already appears in the first column of the
    existing file or earlier in *urls*. The file is created if it does not
    exist. Rows are written as UTF-8.
    """
    existing = set()
    if os.path.exists(csv_path):
        # newline='' is what the csv module expects; errors='replace' keeps a
        # file written earlier in another encoding from aborting the run.
        with open(csv_path, 'r', newline='', encoding='utf-8', errors='replace') as f:
            # Blank lines come back as empty rows; indexing them would raise.
            existing = {row[0] for row in csv.reader(f) if row}
    with open(csv_path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for url in urls:
            if url not in existing:
                writer.writerow([url])
                existing.add(url)
def download_file(url, filetype):
    """Download *url* into ``output_folder`` when its path ends in ``.<filetype>``.

    The local name is the last segment of the URL path. Nothing is done when
    the extension does not match or a file with that name already exists.
    Progress and failures are reported with ``print``; no exception escapes.
    """
    partial_name = None
    try:
        remote_path = urlparse(url).path
        # Test the path rather than the whole URL: a query string such as
        # "?file=a.pdf" is not a file name, and "a.pdf?dl=1" still is one.
        if not remote_path.lower().endswith(f".{filetype}"):
            return
        file_name = os.path.join(output_folder, os.path.basename(remote_path))
        if os.path.exists(file_name):
            return

        # Write to a per-thread temporary name and rename at the end, so an
        # interrupted transfer never leaves a truncated file that a later run
        # would treat as already downloaded.
        partial_name = f"{file_name}.{threading.get_ident()}.part"
        with requests.get(url, headers=headers, timeout=10, stream=True) as r:
            if r.status_code != 200:
                print(f"[!] Skipped {url}: HTTP {r.status_code}")
                return
            with open(partial_name, 'wb') as f:
                # Stream in chunks instead of holding the whole body in memory.
                for chunk in r.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
        os.replace(partial_name, file_name)
        print(f"[+] Downloaded: {file_name}")
    except Exception as e:
        print(f"[!] Failed to download {url}: {e}")
        if partial_name is not None:
            try:
                os.remove(partial_name)
            except OSError:
                # Nothing was written, or it is already gone.
                pass

def zip_downloads(folder):
    """Archive every file under *folder* into ``downloads.zip`` in the working directory.

    Entries are stored under their path relative to *folder*, so files with
    the same name in different subfolders do not collide. Files directly in
    *folder* keep their bare file name.
    """
    archive_name = 'downloads.zip'
    archive_abs = os.path.abspath(archive_name)
    # The context manager closes the archive even if a write fails.
    with zipfile.ZipFile(archive_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(folder):
            for file in files:
                full_path = os.path.join(root, file)
                # Skip the archive itself when *folder* is the working directory.
                if os.path.abspath(full_path) == archive_abs:
                    continue
                zipf.write(full_path, arcname=os.path.relpath(full_path, folder))
    print("[*] Downloads zipped into downloads.zip")

def _url_path_endswith(url, suffix):
    """Return True when the path component of *url* ends with *suffix* (case-insensitive)."""
    try:
        path = urlparse(url).path
    except ValueError:
        # Malformed URL: cannot be a usable file link.
        return False
    return path.lower().endswith(suffix)


def build_enhanced_yahoo_dorker(base_query, filetype, base_domain, max_pages=10, do_zip=True):
    """Run the full crawl: search, expand, filter, record and download.

    Steps:
        1. Search ``"<base_query> filetype:<filetype>"``.
        2. From the hits, derive subdomains of *base_domain* and folder
           prefixes, and run one extra ``site:`` query for each.
        3. Keep the URLs whose path ends in ``.<filetype>``.
        4. Append them to ``results_<filetype>.csv``.
        5. Pass each to ``download_file`` on a pool of 8 threads.
        6. Optionally zip ``output_folder``.

    Args:
        base_query: The dork to start from, e.g. ``site:example.com``.
        filetype: Extension to look for; case and a leading dot are ignored.
        base_domain: Domain used to recognise subdomains and build folder queries.
        max_pages: Result pages requested per query.
        do_zip: When true, call ``zip_downloads(output_folder)`` at the end.

    Returns:
        ``(csv_file, filtered_urls)``: the CSV path and the list of matching URLs.
    """
    # Accept ".pdf" and "PDF" as well as "pdf"; otherwise the suffix filter
    # below would look for "..pdf" / ".PDF" and match nothing.
    filetype = filetype.strip().lstrip(".").lower()

    base_query_full = f"{base_query} filetype:{filetype}"
    print(f"[*] Crawling base query: {base_query_full}")
    all_urls = parse_yahoo_results(base_query_full, max_pages)

    subdomains, folders = extract_subdomains_and_paths(all_urls, base_domain)
    print(f"[+] Found {len(subdomains)} subdomains and {len(folders)} folders")

    for sub in subdomains:
        q = f"site:{sub} filetype:{filetype}"
        print(f"[*] Crawling subdomain: {q}")
        all_urls.extend(parse_yahoo_results(q, max_pages))

    for folder in folders:
        q = f"site:{base_domain}{folder} filetype:{filetype}"
        print(f"[*] Crawling folder path: {q}")
        all_urls.extend(parse_yahoo_results(q, max_pages))

    # Filter URLs for correct filetype. The extension is tested on the URL
    # path so that "file.pdf?dl=1" is kept and "page?name=x.pdf" is dropped.
    suffix = f".{filetype}"
    filtered_urls = [u for u in all_urls if _url_path_endswith(u, suffix)]
    print(f"[+] {len(filtered_urls)} URLs matched filetype .{filetype}")

    # Save to CSV
    csv_file = f"results_{filetype}.csv"
    save_to_csv(csv_file, filtered_urls)

    # Download in parallel; leaving the with-block waits for all workers.
    print("[*] Downloading files...")
    with ThreadPoolExecutor(max_workers=8) as executor:
        executor.map(lambda url: download_file(url, filetype), filtered_urls)

    if do_zip:
        zip_downloads(output_folder)

    print(f"[✓] Completed. CSV: {csv_file}")
    return csv_file, filtered_urls

if __name__ == "__main__":
    # Interactive entry point: prompt for the crawl parameters, then run it.
    print("=== Yahoo Dork File Crawler ===")
    base_query = input("Enter your dork (e.g. site:example.com): ").strip()
    # Tolerate ".pdf" as well as "pdf".
    filetype = input("Enter filetype to search for (e.g. pdf, docx): ").strip().lower().lstrip(".")
    base_domain = input("Enter base domain (e.g. example.com): ").strip()

    # Re-prompt instead of crashing with a traceback on a non-numeric or
    # non-positive page count.
    while True:
        raw_pages = input("How many pages to scan per query (e.g. 20): ").strip()
        try:
            pages = int(raw_pages)
        except ValueError:
            pages = 0
        if pages > 0:
            break
        print("[!] Please enter a positive whole number.")

    do_zip = input("Zip downloads afterwards? (y/n): ").strip().lower() == 'y'

    build_enhanced_yahoo_dorker(base_query, filetype, base_domain, max_pages=pages, do_zip=do_zip)
