"""
DORK ENGINE: Gov Menu + Batch Mode Edition
Scan individual or all .gov/.mil domains for keyword-matching filetypes (.pdf, .docx, .doc)
"""

import requests
from bs4 import BeautifulSoup
import csv
import os
import time
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import quote_plus, urlparse, unquote, parse_qs

# HTTP headers passed to every requests.get() call in this file (search-engine
# result pages and document downloads). Only a User-Agent is set.
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Menu of scan targets, keyed by the number the user types at the prompt.
# Each value is a (display label, domain) pair. Two entries are special-cased
# by menu(): a domain of None (choice "19") makes it ask for a custom domain,
# and the sentinel "BATCH" makes it scan every concrete domain in this table.
GOV_DOMAINS = {
    "1": ("White House", "whitehouse.gov"),
    "2": ("CIA", "cia.gov"),
    "3": ("NSA", "nsa.gov"),
    "4": ("FBI", "fbi.gov"),
    "5": ("US Army", "army.mil"),
    "6": ("US Navy", "navy.mil"),
    "7": ("US Air Force", "af.mil"),
    "8": ("US Marine Corps", "marines.mil"),
    "9": ("NASA", "nasa.gov"),
    "10": ("Department of State", "state.gov"),
    "11": ("Department of Justice", "justice.gov"),
    "12": ("Department of Homeland Security", "dhs.gov"),
    "13": ("National Guard", "nationalguard.mil"),
    "14": ("Louisiana.gov", "louisiana.gov"),
    "15": ("California.gov", "ca.gov"),
    "16": ("New York.gov", "ny.gov"),
    "17": ("Texas.gov", "texas.gov"),
    "18": ("Florida.gov", "florida.gov"),
    "19": ("Custom Domain", None),
    "20": ("Batch Mode (ALL ABOVE)", "BATCH")
}

def build_search_urls(query, page):
    """Build one result-page URL per supported search engine.

    Args:
        query: Raw search string; it is URL-encoded with ``quote_plus``.
        page: Zero-based index of the result page to request.

    Returns:
        A dict mapping the engine name ("yahoo", "bing", "duckduckgo",
        "google", in that order) to the URL of that engine's result page.
    """
    encoded = quote_plus(query)
    # Yahoo and Bing are given a 1-based offset; DuckDuckGo and Google a
    # 0-based one. Each page advances the offset by 10.
    zero_based = page * 10
    one_based = zero_based + 1

    urls = {}
    urls["yahoo"] = f"https://search.yahoo.com/search?p={encoded}&b={one_based}"
    urls["bing"] = f"https://www.bing.com/search?q={encoded}&first={one_based}"
    urls["duckduckgo"] = f"https://html.duckduckgo.com/html/?q={encoded}&s={zero_based}"
    urls["google"] = f"https://www.google.com/search?q={encoded}&start={zero_based}"
    return urls

def extract_true_url_yahoo(href):
    """Unwrap a Yahoo redirect link and return the destination URL.

    A link is treated as a redirect when its host contains
    ``r.search.yahoo.com`` and it carries a ``/RU=`` segment; the
    percent-encoded destination is the text between ``/RU=`` and the
    following ``/RK=`` segment.

    Args:
        href: The ``href`` value taken from a result page.

    Returns:
        The decoded destination URL for redirect links, otherwise ``href``
        unchanged.
    """
    marker = "/RU="
    if "r.search.yahoo.com" not in urlparse(href).netloc:
        return href

    marker_pos = href.find(marker)
    if marker_pos == -1:
        return href

    start = marker_pos + len(marker)
    # Look for the terminator only *after* the target starts, and fall back to
    # the end of the string when it is absent. Using a bare find() result of -1
    # as a slice bound would silently chop the last character off the URL.
    end = href.find("/RK=", start)
    if end == -1:
        end = len(href)
    return unquote(href[start:end])

def extract_true_url_google(href):
    """Unwrap a Google ``/url?q=...`` redirect link.

    Args:
        href: The ``href`` value taken from a result page.

    Returns:
        The first ``q`` query value when the link's path is exactly ``/url``
        and a ``q`` parameter is present; otherwise ``href`` unchanged.
    """
    parts = urlparse(href)
    if parts.path != "/url":
        return href

    # parse_qs omits blank values, so a missing or empty "q" yields None here.
    targets = parse_qs(parts.query).get("q")
    if not targets:
        return href
    return targets[0]

def extract_true_url_bing(href):
    """Unwrap a Bing ``/aclick`` tracking link.

    Args:
        href: The ``href`` value taken from a result page.

    Returns:
        The first ``url`` query value when the link's host contains
        ``bing.com``, its path contains ``/aclick`` and a ``url`` parameter is
        present; otherwise ``href`` unchanged.
    """
    parts = urlparse(href)
    is_click_tracker = "bing.com" in parts.netloc and "/aclick" in parts.path
    if not is_click_tracker:
        return href

    targets = parse_qs(parts.query).get("url")
    return targets[0] if targets else href

def parse_results(html, engine):
    """Collect document links from one search-engine result page.

    Every ``<a href=...>`` in ``html`` is considered. For the engines that wrap
    results in redirect links (yahoo, google, bing) the href is first unwrapped
    with the matching ``extract_true_url_*`` helper; other engine names leave
    the href untouched. A link is kept when ".pdf", ".doc" or ".docx" appears
    anywhere in its lower-cased text.

    Args:
        html: Markup of the result page.
        engine: Engine name as used by ``build_search_urls``.

    Returns:
        A list of the distinct matching links, in no particular order.
    """
    unwrappers = {
        "yahoo": extract_true_url_yahoo,
        "google": extract_true_url_google,
        "bing": extract_true_url_bing,
    }
    unwrap = unwrappers.get(engine)
    wanted_extensions = (".pdf", ".doc", ".docx")

    found = set()
    document = BeautifulSoup(html, "html.parser")
    for anchor in document.find_all("a", href=True):
        target = anchor["href"]
        if unwrap is not None:
            target = unwrap(target)

        lowered = target.lower()
        if any(extension in lowered for extension in wanted_extensions):
            found.add(target)
    return list(found)

def deduplicate_links(links):
    """Return the distinct items of ``links`` in first-seen order.

    Going through a ``set`` would return the links in an arbitrary order that
    changes from one interpreter run to the next (string hashing is
    randomised), which makes exported files and download order churn for no
    reason. ``dict.fromkeys`` removes duplicates while keeping insertion order.

    Args:
        links: Any iterable of hashable link values.

    Returns:
        A new list with duplicates removed.
    """
    return list(dict.fromkeys(links))

def save_links_to_csv(links, domain, keyword):
    """Merge ``links`` into the CSV export for this domain/keyword pair.

    The export lives at ``./exports/<domain>_<keyword>.csv`` (dots in the
    domain and spaces in the keyword become underscores) and holds one link per
    row. Links already in the file are kept, in their existing order, and new
    links are appended after them; duplicates are dropped.

    Args:
        links: Iterable of link strings found by the current scan.
        domain: Domain the scan ran against; used in the file name.
        keyword: Search keyword(s); used in the file name.

    Returns:
        A ``(output_file, combined)`` tuple: the path that was written and the
        list of all links now stored in it.
    """
    output_dir = "./exports"
    os.makedirs(output_dir, exist_ok=True)

    # Both values come from user input. A path separator in either would make
    # the file name point into a missing sub-directory (or outside
    # output_dir), so neutralise separators before building the name.
    safe_domain = domain.replace("/", "_").replace("\\", "_").replace(".", "_")
    safe_keyword = keyword.replace("/", "_").replace("\\", "_").replace(" ", "_")
    output_file = os.path.join(output_dir, f"{safe_domain}_{safe_keyword}.csv")

    existing = []
    if os.path.exists(output_file):
        with open(output_file, newline='', encoding='utf-8') as f:
            # csv.reader yields an empty list for a blank line; indexing it
            # would raise IndexError, so skip such rows.
            existing = [row[0] for row in csv.reader(f) if row]

    # Existing rows first so the file only ever grows at the end; dict.fromkeys
    # de-duplicates while keeping that order.
    combined = list(dict.fromkeys(existing + list(links)))
    with open(output_file, "w", newline='', encoding='utf-8') as f:
        csv.writer(f).writerows([link] for link in combined)
    return output_file, combined

def download_file(link, domain):
    """Download one document into ``./downloads/<domain>/<filetype>/``.

    The file name is the last segment of the URL path. The sub-directory is
    chosen from the name: "pdf", "docx", "doc", or "misc" when none of those
    extensions appears in it. A file that already exists on disk is not
    downloaded again. The request is attempted up to two times; failures are
    printed, never raised.

    Args:
        link: URL of the document.
        domain: Domain the link was found for; dots become underscores in the
            directory name.

    Returns:
        None.
    """
    # Take the name from the URL *path* only, so a "/" inside the query string
    # or a "#fragment" can never end up in the file name.
    try:
        filename = urlparse(link).path.rsplit("/", 1)[-1]
    except ValueError as e:
        print(f"[x] Skipped (malformed URL): {link}: {e}")
        return
    if not filename:
        # e.g. a URL ending in "/": there is nothing to name the file after.
        print(f"[x] Skipped (no file name in URL): {link}")
        return

    lowered = filename.lower()
    if ".pdf" in lowered:
        filetype = "pdf"
    elif ".docx" in lowered:
        # Must be tested before ".doc", which is a prefix of ".docx".
        filetype = "docx"
    elif ".doc" in lowered:
        filetype = "doc"
    else:
        filetype = "misc"

    safe_domain = domain.replace('.', '_')
    download_dir = f"./downloads/{safe_domain}/{filetype}/"
    os.makedirs(download_dir, exist_ok=True)
    filepath = os.path.join(download_dir, filename)

    if os.path.exists(filepath):
        return

    tries = 2
    for attempt in range(tries):
        try:
            print(f"[DL] {link} (try {attempt+1})")
            r = requests.get(link, headers=HEADERS, timeout=5)
            # Without this check a 403/404/5xx error page would be written to
            # disk under the document's name and never retried.
            r.raise_for_status()
            with open(filepath, "wb") as f:
                f.write(r.content)
            return
        except Exception as e:
            # Deliberately broad: this runs inside a worker thread, where an
            # uncaught exception would otherwise vanish without any output.
            print(f"[!] Failed attempt {attempt+1} for {link}: {e}")
    print(f"[x] Skipped after {tries} failed attempts: {link}")

def download_files_multithreaded(links, domain):
    """Download every distinct link concurrently using six worker threads.

    Args:
        links: Iterable of document URLs; duplicates are removed first.
        domain: Domain passed through to ``download_file`` for each link.

    Returns:
        None. Blocks until all downloads have finished.
    """
    links = deduplicate_links(links)
    with ThreadPoolExecutor(max_workers=6) as executor:
        submitted = [
            (link, executor.submit(download_file, link, domain))
            for link in links
        ]

    # Leaving the with-block waits for every worker. An exception that escapes
    # a worker is stored on its future and is lost unless somebody asks for
    # it, so report those here instead of dropping them silently.
    for link, future in submitted:
        error = future.exception()
        if error is not None:
            print(f"[!] Download worker crashed for {link}: {error}")

def run_dork_scan(domain, keyword, max_pages=20):
    """Search every engine for documents on ``domain`` and download them.

    Builds a ``site:<domain> <keyword> filetype:...`` query, walks the result
    pages of each engine returned by ``build_search_urls``, collects the
    document links, merges them into the CSV export and finally downloads
    every link stored in that export.

    Args:
        domain: Domain to restrict the search to.
        keyword: Keyword(s) to search for.
        max_pages: Number of result pages to request per engine. Defaults to
            20, the value that used to be hard-coded.

    Returns:
        None. Progress and errors are printed.
    """
    query = f"site:{domain} {keyword} filetype:pdf OR filetype:docx OR filetype:doc"
    all_links = []
    for page in range(max_pages):
        print(f"\n[+] Scraping page {page + 1}")
        for engine, url in build_search_urls(query, page).items():
            print(f"[{engine.upper()}] {url}")
            try:
                r = requests.get(url, headers=HEADERS, timeout=10)
                # An HTTP error response (e.g. 429 when rate-limited) holds no
                # results; surface it instead of parsing it as a result page.
                r.raise_for_status()
                all_links.extend(parse_results(r.text, engine))
            except Exception as e:
                print(f"[!] Error scraping {engine}: {e}")
            # Pause after failures too, so an engine that has started
            # rejecting requests is not hit again immediately.
            time.sleep(1)

    unique_links = deduplicate_links(all_links)
    output_file, links_saved = save_links_to_csv(unique_links, domain, keyword)
    print(f"\n[+] {len(links_saved)} link(s) stored in {output_file}")
    download_files_multithreaded(links_saved, domain)

def menu():
    """Show the domain menu, read the user's choice and start the scan.

    Depending on the selected ``GOV_DOMAINS`` entry this runs a scan for a
    custom domain typed by the user (choice "19"), for every concrete domain
    in the table (the "BATCH" entry), or for the single selected domain. Any
    other input prints "Invalid selection.".
    """
    print("Select a government/military domain to scan:")
    for number, (title, _unused) in GOV_DOMAINS.items():
        print(f"{number}. {title}")

    choice = input("Enter choice number: ").strip()
    _title, domain = GOV_DOMAINS.get(choice, ("Invalid", None))

    if domain is None:
        # None is both the "custom domain" marker and the unknown-choice
        # fallback; only choice "19" means the former.
        if choice != "19":
            print("Invalid selection.")
            return
        domain = input("Enter your custom domain: ").strip()
        keyword = input("Enter keyword(s) to search for: ").strip()
        run_dork_scan(domain, keyword)
        return

    if domain == "BATCH":
        keyword = input("Enter keyword(s) to batch scan all domains for: ").strip()
        for name, target in GOV_DOMAINS.values():
            # Skip the custom-domain placeholder and the batch entry itself.
            if not target or target == "BATCH":
                continue
            print(f"\n====== Running for {name} ({target}) ======")
            run_dork_scan(target, keyword)
        return

    if not domain:
        print("Invalid selection.")
        return

    keyword = input("Enter keyword(s) to search for: ").strip()
    run_dork_scan(domain, keyword)

# Start the interactive menu only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    menu()
