import csv
import os
import subprocess
import threading
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from urllib.parse import quote_plus
from urllib.parse import urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup

# === CONFIG ===
# Root folder for downloaded files; created (if absent) as soon as the
# module is loaded.
DOWNLOAD_DIR = Path("downloads")
DOWNLOAD_DIR.mkdir(exist_ok=True)

# Menu of selectable contractors. Keys are the menu numbers as strings
# ("1".."6"); values are (display name, domain). The last entry is the
# "All" pseudo-contractor rather than a real domain.
military_contractors = {
    str(position): entry
    for position, entry in enumerate(
        (
            ("Boeing", "boeing.com"),
            ("Raytheon", "rtx.com"),
            ("General Dynamics", "gd.com"),
            ("Lockheed Martin", "lockheedmartin.com"),
            ("Northrop Grumman", "northropgrumman.com"),
            ("All", "all"),
        ),
        start=1,
    )
}

# File extensions the user is allowed to request for download.
allowed_types = ["pdf", "doc", "docx"]

def build_yahoo_url(query, domain, page):
    """Return the Yahoo search URL for a ``site:``-restricted query.

    Args:
        query: Free-text search keyword(s). URL-encoded here, so spaces and
            reserved characters such as ``&`` or ``#`` cannot break out of
            the ``p`` parameter.
        domain: Domain placed after the ``site:`` operator.
        page: Zero-based result page index.

    Returns:
        The full search URL. The ``b`` parameter is ``page * 10 + 1``, i.e.
        it advances by ten for every page.
    """
    encoded_domain = quote_plus(domain)
    encoded_query = quote_plus(query)
    return (
        "https://search.yahoo.com/search"
        f"?p=site%3A{encoded_domain}+{encoded_query}&b={page * 10 + 1}"
    )

def extract_links(domain, query, pages=2):
    """Collect links mentioning *domain* from Yahoo search result pages.

    Each page is fetched independently; a page that cannot be fetched (network
    error, timeout, or HTTP error status) is reported and skipped so the
    remaining pages are still processed.

    Args:
        domain: Domain to search within; also used to filter the hrefs found.
        query: Search keyword(s).
        pages: Number of result pages to request.

    Returns:
        A sorted list of the unique hrefs (with any ``?query`` part removed)
        that contain *domain*.
    """
    results = set()
    headers = {"User-Agent": "Mozilla/5.0"}
    for page in range(pages):
        url = build_yahoo_url(query, domain, page)
        # Only the network call is guarded; an HTTP error status is treated
        # as a failed fetch instead of parsing the error page for links.
        try:
            response = requests.get(url, headers=headers, timeout=5)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"[!] Error fetching from Yahoo: {e}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        for a in soup.find_all('a', href=True):
            href = a['href'].split("?")[0]
            # NOTE(review): this is a substring test, so any href that merely
            # mentions the domain anywhere (e.g. inside a redirect/tracking
            # URL) is accepted -- confirm whether a hostname comparison is
            # what is actually wanted.
            if domain in href:
                results.add(href)
    # Sorted so the output order is deterministic between runs.
    return sorted(results)

def save_links_to_csv(domain, links):
    """Merge *links* into ``<domain>.csv`` and return every stored link.

    The CSV (in the current working directory) has no header and no index.
    Links already present in the file are kept, new ones are appended, and
    duplicates are dropped.

    Args:
        domain: Used to build the CSV file name ``f"{domain}.csv"``.
        links: Iterable of link strings; may be empty.

    Returns:
        The list of unique links now stored in the file, previously saved
        links first.
    """
    csv_path = Path(f"{domain}.csv")
    # An explicit column label 0 guarantees the column exists even when
    # ``links`` is empty, so the ``combined[0]`` lookup below cannot fail.
    df_new = pd.DataFrame({0: list(links)}, dtype=object)
    frames = [df_new]
    if csv_path.exists():
        try:
            # Read as text so stored values compare equal to the new strings.
            df_old = pd.read_csv(csv_path, header=None, dtype=str)
        except pd.errors.EmptyDataError:
            # The file exists but holds no rows; nothing to merge.
            pass
        else:
            frames.insert(0, df_old)
    combined = pd.concat(frames).drop_duplicates()
    combined.to_csv(csv_path, index=False, header=False)
    print(f"[+] Saved {len(combined)} unique links to {csv_path}")
    return combined[0].tolist()

def download_file(url, filetype):
    """Download *url* with ``wget`` into ``DOWNLOAD_DIR/<filetype>``.

    The download is skipped when a file with the URL's last path segment
    already exists in the target folder. Failures are reported on stdout and
    never raised, so one bad link does not abort a batch.

    Args:
        url: Address of the file to fetch. It comes from scraped pages and is
            therefore treated as untrusted input.
        filetype: Extension used as the sub-folder name under DOWNLOAD_DIR.
    """
    try:
        folder = DOWNLOAD_DIR / filetype
        folder.mkdir(parents=True, exist_ok=True)
        filename = url.split("/")[-1]
        filepath = folder / filename
        if filepath.exists():
            return
        # The command is an argument list run without a shell, so characters
        # in the URL (quotes, ``;``, ``$(...)``) cannot inject shell commands.
        # ``--`` ends option parsing so a URL starting with ``-`` is not read
        # as a wget option.
        cmd = ["wget", "-t", "2", "--timeout=5", "-nc", "-P", str(folder), "--", url]
        completed = subprocess.run(cmd, check=False)
        if completed.returncode != 0:
            print(
                f"[!] Error downloading {url}: "
                f"wget exited with status {completed.returncode}"
            )
    except (OSError, ValueError) as e:
        # OSError: folder creation failed or wget is not installed.
        # ValueError: the URL cannot be passed as a process argument.
        print(f"[!] Error downloading {url}: {e}")

def threaded_download(links, filetypes, max_workers=8):
    """Download every link whose extension is in *filetypes*, concurrently.

    Blocks until all downloads have finished.

    Args:
        links: Iterable of URL strings.
        filetypes: Extensions (without the dot) to download. The match is
            made against the lower-cased link. Repeated entries are ignored.
        max_workers: Upper bound on simultaneous downloads, so a long link
            list does not start one thread (and one wget process) per link.
    """
    extensions = list(dict.fromkeys(filetypes))

    # A dict keyed by (link, ext) drops repeated links while keeping order,
    # so the same file is never fetched by two workers at once.
    jobs = {}
    for link in links:
        lowered = link.lower()
        for ext in extensions:
            if lowered.endswith(f".{ext}"):
                jobs[(link, ext)] = None
                break  # one download per link is enough
    if not jobs:
        return

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [
            (link, pool.submit(download_file, link, ext)) for link, ext in jobs
        ]
        for link, future in futures:
            # Waits for the task; report anything download_file did not
            # handle itself instead of losing it silently.
            error = future.exception()
            if error is not None:
                print(f"[!] Error downloading {link}: {error}")

def main():
    """Run the interactive scraper.

    Prompts for a contractor, a search keyword and the file types to fetch,
    searches Yahoo for each selected domain, records the links found in a
    per-domain CSV, and downloads the links that match the chosen types.
    Returns early, with a message, on an invalid contractor choice, when no
    valid file type is given, or when no links are found.
    """
    print("==== Military Contractor Scraper ====")
    for k, v in military_contractors.items():
        print(f"{k}. {v[0]}")
    choice = input("Choose contractor (1-6): ").strip()
    # An unknown choice used to yield a None target and crash on unpacking.
    if choice not in military_contractors:
        print("[-] Invalid contractor choice.")
        return

    query = input("Enter search keyword: ").strip()

    requested = input("File types to download (comma separated, e.g., pdf,docx): ").lower().split(',')
    # Keep only allowed types; dict.fromkeys drops repeats but keeps order.
    selected_types = list(
        dict.fromkeys(t.strip() for t in requested if t.strip() in allowed_types)
    )

    if not selected_types:
        print("[-] No valid filetypes selected.")
        return

    if choice == "6":
        # "All": every real contractor, i.e. everything but the last entry.
        targets = list(military_contractors.values())[:-1]
    else:
        targets = [military_contractors[choice]]

    all_links = []
    for _name, domain in targets:
        print(f"[*] Searching Yahoo for {domain}...")
        links = extract_links(domain, query)
        if links:
            unique_links = save_links_to_csv(domain, links)
            all_links.extend(unique_links)

    # The same link can be stored under more than one domain; fetch it once.
    all_links = list(dict.fromkeys(all_links))

    if not all_links:
        print("[-] No links found.")
        return

    print(f"[+] Starting downloads for {len(all_links)} links...")
    threaded_download(all_links, selected_types)
    print("[✓] Done.")

# Start the interactive scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
