
#!/usr/bin/env python3
"""
Archiver for #OpTanzania
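- Walks a directory tree and submits the URLs found in its JSONL files to the Wayback Machine
  (an Archive.today endpoint is defined, but no code in this file submits to it)
- Writes the results to archive_results.json in the scanned root directory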
"""
import os, sys, json, argparse, concurrent.futures, time, requests
from urllib.parse import urlparse

# Wayback "save" endpoint; save_wayback() appends the target URL directly to it.
WAYBACK_SAVE = "https://web.archive.org/save/"
# Archive.today submission endpoint. Not referenced by any code visible in this file.
ARCHIVE_TODAY = "https://archive.today/submit/"  # Often blocks automation.

def iter_jsonl_files(root):
    """Yield the path of every ``.jsonl`` file found beneath *root*.

    The tree is traversed with ``os.walk``; each yielded path is the
    directory reported by the walk joined with the matching file name.
    A root that does not exist simply yields nothing.
    """
    for current_dir, _subdirs, names in os.walk(root):
        yield from (
            os.path.join(current_dir, name)
            for name in names
            if name.endswith(".jsonl")
        )

def extract_urls_from_jsonl(path):
    """Collect the distinct HTTP(S) URLs stored in a JSONL file.

    Every line is parsed as one JSON document. For records that are JSON
    objects, string values are inspected at the top level and one level
    down (inside values that are themselves objects); those starting with
    ``http://`` or ``https://`` are collected.

    Blank lines, lines that are not valid JSON, and records that are not
    JSON objects are skipped so that one bad line does not abort the file.

    Args:
        path: Path of the JSONL file to read (decoded as UTF-8).

    Returns:
        A list of URLs in first-seen order with duplicates removed.
    """
    url_prefixes = ("http://", "https://")
    seen = {}  # dict keys give order-preserving de-duplication
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except (ValueError, RecursionError):
                # json.JSONDecodeError is a ValueError; malformed or
                # pathologically nested lines are skipped, not fatal.
                continue
            if not isinstance(record, dict):
                continue
            for value in record.values():
                # Look one level into nested objects, otherwise at the value itself.
                candidates = value.values() if isinstance(value, dict) else (value,)
                for candidate in candidates:
                    if isinstance(candidate, str) and candidate.startswith(url_prefixes):
                        seen[candidate] = None
    return list(seen)

def save_wayback(url):
    """Request a Wayback capture of *url* and report the outcome as a dict.

    Issues a GET to ``WAYBACK_SAVE`` with *url* appended (30 second timeout).

    Returns:
        On a completed request: ``{"url", "status", "archive"}`` where
        ``status`` is the HTTP status code and ``archive`` is the response's
        ``Content-Location`` header (empty string when absent).
        On any exception: ``{"url", "error"}`` with the exception text, so a
        single failing URL never raises into the caller.
    """
    try:
        response = requests.get(WAYBACK_SAVE + url, timeout=30)
        outcome = {
            "url": url,
            "status": response.status_code,
            "archive": response.headers.get("Content-Location", ""),
        }
    except Exception as exc:
        outcome = {"url": url, "error": str(exc)}
    return outcome

def main():
    """Command-line entry point: archive every URL found under ``--path``.

    Steps:
      1. Parse ``--path``, ``--concurrency`` and ``--wayback``.
      2. Gather the distinct URLs from all JSONL files below ``--path``.
      3. Submit each URL through ``save_wayback`` on a thread pool.
      4. Write the collected result dicts to ``archive_results.json``
         inside ``--path``.

    Invalid arguments (``--path`` not an existing directory, or
    ``--concurrency`` below 1) are rejected up front via
    ``ArgumentParser.error``, which prints a usage message and exits with
    status 2, instead of surfacing later as a traceback.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--path", required=True, help="Root path to scan for JSONL")
    ap.add_argument("--concurrency", type=int, default=2)
    # NOTE: --wayback is accepted for CLI compatibility, but nothing in this
    # function reads it; URLs are always submitted through save_wayback.
    ap.add_argument("--wayback", action="store_true", help="Force Wayback even if config disabled")
    args = ap.parse_args()

    # Fail early: otherwise a bad path is only noticed when the results
    # file cannot be opened, after "Found 0 URLs" has been printed.
    if not os.path.isdir(args.path):
        ap.error(f"--path is not a directory: {args.path}")
    # ThreadPoolExecutor raises ValueError for max_workers <= 0.
    if args.concurrency < 1:
        ap.error("--concurrency must be at least 1")

    # A dict keeps first-seen order while de-duplicating across files.
    all_urls = {}
    for jf in iter_jsonl_files(args.path):
        all_urls.update(dict.fromkeys(extract_urls_from_jsonl(jf)))
    total = len(all_urls)

    print(f"[archiver] Found {total} URLs to archive via Wayback")

    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=args.concurrency) as ex:
        futs = [ex.submit(save_wayback, u) for u in all_urls]
        for fu in concurrent.futures.as_completed(futs):
            results.append(fu.result())
            done = len(results)
            # Report every 10 completions, and once more at the very end so
            # the final count is always shown.
            if done % 10 == 0 or done == total:
                print(f"[archiver] processed {done}/{total}")

    # Store results
    outp = os.path.join(args.path, "archive_results.json")
    with open(outp, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"[archiver] Saved results to {outp}")

# Run the archiver only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
