import csv
import json
import os
import subprocess
import threading
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from urllib.parse import quote_plus, unquote

import requests
from bs4 import BeautifulSoup

# === CONFIG ===
# Root folder for downloaded documents; download_file() stores each file in a
# subfolder named after its extension.
DOWNLOAD_DIR = Path("downloads")
# Created as soon as the module is loaded; no error if it already exists.
DOWNLOAD_DIR.mkdir(exist_ok=True)
# File extensions a user may request; main() discards anything not listed here.
allowed_types = ['pdf', 'doc', 'docx']

# Menu of searchable contractors: menu key -> (display name, domain).
# Keys are the 1-based positions in the tuple below, as strings, because they
# are compared against raw input() text. The final "All" entry is a sentinel
# meaning "every contractor above", not a real domain.
military_contractors = {
    str(position): entry
    for position, entry in enumerate(
        (
            ("Boeing", "boeing.com"),
            ("Raytheon", "rtx.com"),
            ("General Dynamics", "gd.com"),
            ("Lockheed Martin", "lockheedmartin.com"),
            ("Northrop Grumman", "northropgrumman.com"),
            ("All", "all"),
        ),
        start=1,
    )
}

def build_yahoo_url(query, domain, page):
    """Build the Yahoo search URL for one result page of a site-restricted query.

    Args:
        query: Free-text search keywords, exactly as typed by the user.
        domain: Domain used in the ``site:`` restriction.
        page: Zero-based result page index.

    Returns:
        The search URL as a string. The ``b`` parameter is the 1-based index
        of the first result on the page (10 results per page).
    """
    # Encode the whole search expression so characters such as "&", "#" or "+"
    # in the keywords cannot terminate or corrupt the "p" parameter.
    search_terms = quote_plus(f"site:{domain} {query}")
    first_result = page * 10 + 1
    return f"https://search.yahoo.com/search?p={search_terms}&b={first_result}"

def extract_links(domain, query, pages=2):
    """Collect links mentioning ``domain`` from Yahoo search result pages.

    Args:
        domain: Domain to search within; also the substring a link must
            contain to be kept.
        query: Search keywords.
        pages: Number of result pages to fetch.

    Returns:
        A sorted list of unique link strings with any query string removed.
        Pages that cannot be fetched are reported on stdout and skipped.
    """
    found = set()
    headers = {"User-Agent": "Mozilla/5.0"}
    for page in range(pages):
        url = build_yahoo_url(query, domain, page)
        # Only the network call is guarded, so a failed page is skipped
        # without hiding unrelated programming errors.
        try:
            response = requests.get(url, headers=headers, timeout=5)
            # Treat HTTP error responses (rate limiting, server errors) as
            # failures instead of parsing the error page for links.
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"[!] Error fetching from Yahoo: {e}")
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        for anchor in soup.find_all('a', href=True):
            # Drop everything from the first "?" onwards.
            href = anchor['href'].split("?")[0]
            if domain in href:
                found.add(href)
    return sorted(found)

def expand_redirect(url):
    """Resolve a Yahoo redirect link to the URL it finally lands on.

    Args:
        url: Any link; only those containing ``r.search.yahoo.com`` are
            resolved over the network.

    Returns:
        The final URL after following redirects, or ``url`` unchanged when it
        is not a Yahoo redirect link or the request fails.
    """
    # Non-string values are handed back untouched, as callers previously
    # observed, rather than raising on the substring test below.
    if not isinstance(url, str) or "r.search.yahoo.com" not in url:
        return url
    try:
        # stream=True skips downloading the response body: only the final URL
        # is needed, and the target may be a large document.
        with requests.get(url, allow_redirects=True, timeout=5, stream=True) as response:
            return response.url
    except requests.RequestException:
        # Best effort: keep the unresolved link rather than losing it.
        return url

def save_links_to_csv(domain, links):
    """Append links not yet recorded for ``domain`` to ``<domain>.csv``.

    Each incoming link is passed through ``expand_redirect`` before it is
    compared with the links already stored in the file.

    Args:
        domain: Domain name; the CSV file is ``<domain>.csv`` in the current
            working directory.
        links: Iterable of link strings to record.

    Returns:
        A sorted list of every link now known for the domain (previously
        stored plus newly appended).
    """
    csv_path = Path(f"{domain}.csv")
    existing = set()
    if csv_path.exists():
        # newline='' lets the csv module do its own newline handling, as its
        # documentation requires for file objects.
        with open(csv_path, 'r', newline='') as f:
            existing = {row[0] for row in csv.reader(f) if row}
    new_links = {expand_redirect(link) for link in links} - existing
    if new_links:
        with open(csv_path, 'a', newline='') as f:
            # Sorted so the file content is reproducible between runs.
            csv.writer(f).writerows([link] for link in sorted(new_links))
        print(f"[+] Appended {len(new_links)} new links to {csv_path}")
    else:
        print(f"[=] No new links to append to {csv_path}")
    return sorted(existing | new_links)

def combine_all_csvs_to_json():
    """Merge the first column of every ``*.csv`` in the current directory.

    Each non-empty link is passed through ``expand_redirect``; the unique
    results are written, sorted, to ``all_links.json`` in the current
    directory.
    """
    combined_links = set()
    for csv_file in Path(".").glob("*.csv"):
        # newline='' lets the csv module do its own newline handling.
        with open(csv_file, "r", newline="") as f:
            for row in csv.reader(f):
                if not row:
                    continue
                link = row[0].strip()
                # An empty first cell is not a link; skip it instead of
                # storing "" and later treating it as a download candidate.
                if not link:
                    continue
                combined_links.add(expand_redirect(link))
    with open("all_links.json", "w") as f:
        json.dump(sorted(combined_links), f, indent=2)
    print("[✓] Combined all CSVs into all_links.json")

def download_file(url):
    """Download ``url`` with wget into ``DOWNLOAD_DIR/<extension>/``.

    The extension is the text after the last "." in the URL and the file name
    is the text after the last "/". Nothing is downloaded when a file of that
    name already exists in the target folder. Failures are reported on stdout
    and never raised, so one bad link cannot abort a batch of downloads.

    Args:
        url: Link to the document to fetch.
    """
    try:
        ext = url.split('.')[-1].lower()
        folder = DOWNLOAD_DIR / ext
        folder.mkdir(parents=True, exist_ok=True)
        filename = url.split("/")[-1]
        filepath = folder / filename
        if filepath.exists():
            return
        # The URL comes from scraped pages, so it is passed as a separate
        # argv entry with no shell involved; "--" stops wget from treating a
        # URL that starts with "-" as an option.
        result = subprocess.run(
            ["wget", "-t", "2", "--timeout=5", "-nc", "-P", str(folder), "--", url],
            check=False,
        )
        if result.returncode != 0:
            print(f"[!] wget exited with status {result.returncode} for {url}")
    except Exception as e:
        # Worker boundary: report and carry on (covers a missing wget
        # binary and filesystem errors).
        print(f"[!] Error downloading {url}: {e}")

def threaded_download_from_json(filetypes, max_workers=8):
    """Download every link in ``all_links.json`` that ends in a wanted extension.

    Args:
        filetypes: Extensions without the leading dot (e.g. ``["pdf"]``).
            Matching is done against the lower-cased link.
        max_workers: Upper bound on simultaneous downloads. A bounded pool
            keeps a long link list from starting one thread and one wget
            process per link all at once.
    """
    with open("all_links.json", "r") as f:
        all_links = json.load(f)
    suffixes = tuple(f".{t}" for t in filetypes)
    valid_links = [link for link in all_links if link.lower().endswith(suffixes)]
    if valid_links:
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            pending = [(url, pool.submit(download_file, url)) for url in valid_links]
            for url, future in pending:
                # exception() waits for the task to finish; surface anything
                # the worker did not handle itself instead of dropping it.
                error = future.exception()
                if error is not None:
                    print(f"[!] Error downloading {url}: {error}")
    print("[✓] All downloads complete.")

def main():
    """Run the interactive scrape-and-download workflow.

    Prompts for a contractor, search keywords and file types, searches Yahoo
    for each selected domain, records the links per domain in CSV files,
    merges all CSVs into ``all_links.json`` and downloads the links that end
    in one of the requested file types. Returns early, with a message, when
    the contractor choice or the file type selection is invalid.
    """
    print("==== Military Contractor Yahoo Scraper ====")
    for key, (name, _domain) in military_contractors.items():
        print(f"{key}. {name}")
    choice = input("Choose contractor (1–6): ").strip()
    # An unknown key used to yield None and crash later while unpacking.
    if choice not in military_contractors:
        print("[-] Invalid contractor choice.")
        return
    query = input("Search keyword: ").strip()
    requested = input("Filetypes to download (pdf,doc,docx): ").lower().split(',')
    selected_types = [t.strip() for t in requested if t.strip() in allowed_types]
    if not selected_types:
        print("[-] No valid filetypes selected.")
        return
    selected = military_contractors[choice]
    if selected[1] == "all":
        # "all" is a sentinel entry: search every real contractor instead.
        targets = [entry for entry in military_contractors.values() if entry[1] != "all"]
    else:
        targets = [selected]
    for _name, domain in targets:
        print(f"[*] Searching Yahoo for {domain}...")
        links = extract_links(domain, query)
        if links:
            save_links_to_csv(domain, links)
    combine_all_csvs_to_json()
    threaded_download_from_json(selected_types)

# Start the interactive workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
