import csv
import logging
import os
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from urllib.parse import urlencode, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup

# === CONFIG ===
# Root folder for downloaded documents; download_file() stores each file in a
# per-filetype sub-folder beneath it.
DOWNLOAD_DIR = Path("downloads")
# NOTE: runs at import time and is relative to the current working directory.
DOWNLOAD_DIR.mkdir(exist_ok=True)

# Menu key -> (display name, search domain) as shown and read by main().
# The final "All" entry is a sentinel rather than a real domain: main() expands
# it to every preceding entry by slicing off the last value, so it must stay
# last and keep the key "6".
military_contractors = {
    "1": ("Boeing", "boeing.com"),
    "2": ("Raytheon", "rtx.com"),
    "3": ("General Dynamics", "gd.com"),
    "4": ("Lockheed Martin", "lockheedmartin.com"),
    "5": ("Northrop Grumman", "northropgrumman.com"),
    "6": ("All", "all")
}

# File extensions (without the leading dot) that are collected from search
# results and that the user may ask to download.
allowed_types = ['pdf', 'doc', 'docx']
# Configures the root logger at import time: INFO level, timestamped messages.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

def build_yahoo_url(query, domain, page):
    """Build the Yahoo search URL for one result page of a site-restricted query.

    The search phrase ``site:<domain> <query>`` and the offset are passed
    through ``urlencode`` so that spaces and reserved characters in *query*
    (``&``, ``=``, ``#``, ``+`` ...) cannot corrupt or alter the query string.

    Args:
        query: Free-text search terms.
        domain: Domain the search is restricted to via ``site:``.
        page: Zero-based page number; the ``b`` parameter is sent as
            ``page * 10 + 1`` (presumably the 1-based index of the first
            result at ten results per page -- confirm against Yahoo).

    Returns:
        The encoded search URL as a string.
    """
    params = {"p": f"site:{domain} {query}", "b": page * 10 + 1}
    return f"https://search.yahoo.com/search?{urlencode(params)}"

def validate_query(query):
    """Return *query* unchanged if it only contains permitted characters.

    Permitted characters are ASCII letters, digits, whitespace, ``-`` and
    ``+``; the query must contain at least one character.

    Raises:
        ValueError: If *query* is empty or contains any other character.
    """
    is_clean = re.match(r'^[a-zA-Z0-9\s\-+]+$', query) is not None
    if is_clean:
        return query
    raise ValueError("Invalid characters in query")

def _is_hosted_on(href, domain):
    """Return True when the host of *href* is *domain* or one of its subdomains."""
    try:
        host = urlparse(href).hostname or ""
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. broken IPv6 literals).
        return False
    domain = domain.lower()
    return host == domain or host.endswith(f".{domain}")


def extract_links(domain, query, pages=2):
    """Collect document links for *domain* from Yahoo search result pages.

    Each result page is fetched and every anchor whose ``href`` ends in one of
    ``allowed_types`` and whose host is *domain* (or a subdomain of it) is
    kept. A page that fails to download or parse is logged and skipped.

    Args:
        domain: Domain to restrict the search to and to filter links by.
        query: Search terms passed to ``build_yahoo_url``.
        pages: Number of result pages to fetch.

    Returns:
        A sorted list of unique matching URLs (possibly empty).
    """
    results = set()
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
    suffixes = tuple(f".{ext}" for ext in allowed_types)
    for page in range(pages):
        if page:
            # Rate limiting: pause between consecutive requests, including
            # after a failed one, but not after the final page.
            time.sleep(1)
        url = build_yahoo_url(query, domain, page)
        try:
            response = requests.get(url, headers=headers, timeout=5)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            for a in soup.find_all('a', href=True):
                href = a['href']
                # Compare the parsed host rather than using a substring test,
                # so a foreign URL that merely mentions the domain is rejected.
                if href.lower().endswith(suffixes) and _is_hosted_on(href, domain):
                    results.add(href)
        except Exception as e:
            # Deliberate best-effort boundary: one bad page must not abort
            # the remaining pages.
            logging.error(f"Error fetching from Yahoo for {url}: {e}")
    # Sorted so the output order does not depend on set iteration order.
    return sorted(results)

def save_links_to_csv(domain, links):
    """Merge *links* into ``<domain>.csv`` and return every URL now stored there.

    The CSV lives in the current working directory and has a single ``url``
    column. Rows already in the file are kept ahead of the new ones, and
    duplicate URLs are dropped (first occurrence wins).

    Args:
        domain: Used as the CSV file stem.
        links: URLs to add.

    Returns:
        List of the unique URLs written to the file.
    """
    csv_path = Path(f"{domain}.csv")
    fresh = pd.DataFrame(links, columns=['url'])
    # Prepend what an earlier run saved, if anything.
    merged = pd.concat([pd.read_csv(csv_path), fresh]) if csv_path.exists() else fresh
    combined = merged.drop_duplicates(subset='url')
    combined.to_csv(csv_path, index=False)
    logging.info(f"Saved {len(combined)} unique links to {csv_path}")
    return combined['url'].tolist()

def download_file(url, filetype):
    """Download *url* into ``DOWNLOAD_DIR/<filetype>/`` unless it is already there.

    The file name is the last path segment of the URL. The body is streamed
    into a temporary ``.part`` file and renamed to its final name only after
    the transfer completes, so an interrupted download never leaves a
    truncated file that a later run would skip as "already exists".

    Args:
        url: URL of the document to fetch.
        filetype: Name of the sub-folder of ``DOWNLOAD_DIR`` to store it in.

    Returns:
        None. Any exception is logged instead of being raised.
    """
    try:
        folder = DOWNLOAD_DIR / filetype
        folder.mkdir(parents=True, exist_ok=True)
        filename = urlparse(url).path.split("/")[-1]
        if not filename:
            # Without this check the target path would be the folder itself.
            raise ValueError("URL has no file name component")
        filepath = folder / filename
        if filepath.exists():
            logging.info(f"Skipping {url}: already exists")
            return
        # The thread id keeps concurrent workers that resolve to the same
        # file name from writing into one another's temporary file.
        partial = folder / f"{filename}.{threading.get_ident()}.part"
        try:
            # The context manager releases the connection even on failure.
            with requests.get(url, stream=True, timeout=5) as response:
                response.raise_for_status()
                with open(partial, 'wb') as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
            partial.replace(filepath)
        finally:
            # Only still present when the download did not complete.
            if partial.exists():
                partial.unlink()
        logging.info(f"Downloaded {url}")
    except Exception as e:
        logging.error(f"Error downloading {url}: {e}")

def threaded_download(links, filetypes):
    """Download every link that ends in one of *filetypes* using a thread pool.

    Each matching ``(link, filetype)`` pair is handed to ``download_file`` on
    a pool of up to 10 worker threads. The call returns once all submitted
    downloads have finished.

    Args:
        links: Iterable of URLs; entries that are not strings are ignored.
        filetypes: Iterable of extensions without the leading dot
            (e.g. ``"pdf"``).
    """
    # De-duplicate (keeping order) so the same file is never fetched by two
    # workers at once and written to the same path.
    unique_links = [link for link in dict.fromkeys(links) if isinstance(link, str)]
    unique_types = list(dict.fromkeys(filetypes))
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {
            executor.submit(download_file, link, ext): link
            for link in unique_links
            for ext in unique_types
            if link.lower().endswith(f".{ext}")
        }
    # The pool has been joined here, so every future is complete; report any
    # failure that escaped the worker instead of dropping it silently.
    for future, link in futures.items():
        error = future.exception()
        if error is not None:
            logging.error(f"Download task for {link} failed: {error}")

def main():
    """Run the interactive scraper.

    Prompts for a contractor, a search keyword and the file types to fetch,
    then searches each selected domain, records the links found in that
    domain's CSV file and downloads the matching documents. Invalid input or
    an empty result set is logged and ends the run early.
    """
    print("==== Military Contractor Scraper ====")
    for key, (label, _domain) in military_contractors.items():
        print(f"{key}. {label}")
    choice = input("Choose contractor (1-6): ").strip()
    if choice not in military_contractors:
        logging.error("Invalid choice.")
        return

    try:
        query = validate_query(input("Enter search keyword: ").strip())
    except ValueError as e:
        logging.error(e)
        return

    raw_types = input("File types to download (comma separated, e.g., pdf,docx): ")
    selected_types = []
    for token in raw_types.lower().split(','):
        cleaned = token.strip()
        if cleaned in allowed_types:
            selected_types.append(cleaned)
    if not selected_types:
        logging.error("No valid filetypes selected.")
        return

    # Choice "6" means every real contractor, i.e. all entries but the last.
    if choice == "6":
        targets = list(military_contractors.values())[:-1]
    else:
        targets = [military_contractors.get(choice)]

    all_links = []
    for _name, domain in targets:
        logging.info(f"Searching Yahoo for {domain}...")
        found = extract_links(domain, query)
        if not found:
            continue
        all_links.extend(save_links_to_csv(domain, found))

    if not all_links:
        logging.warning("No links found.")
        return

    logging.info(f"Starting downloads for {len(all_links)} links...")
    threaded_download(all_links, selected_types)
    logging.info("Done.")

if __name__ == "__main__":
    main()
