
import hashlib
import json
import os
from datetime import datetime, timezone

import feedparser
import requests
from bs4 import BeautifulSoup

# Feed URLs that the indexing loop further down iterates over, in order.
rss_feeds = [
    "http://feeds.reuters.com/reuters/topNews",
    "https://www.zerohedge.com/rss.xml",
    "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
    "https://www.aljazeera.com/xml/rss/all.xml",
    "https://feeds.bbci.co.uk/news/world/rss.xml",
]

# Directory (relative to the working directory) that receives one JSON file
# per article. It is created at import time; an existing directory is fine.
output_folder = "sue_news_index"
os.makedirs(output_folder, exist_ok=True)

def clean_html(raw_html):
    """Return the text content of *raw_html* with all markup removed.

    The fragment is parsed with BeautifulSoup's built-in ``html.parser``
    backend and the text of the resulting tree is returned unchanged
    (whitespace is not normalised here).
    """
    return BeautifulSoup(raw_html, "html.parser").get_text()

def generate_hash(text):
    """Return the SHA-256 digest of *text* as a lowercase hex string.

    The text is encoded as UTF-8 before hashing.
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()

def summarize(text, max_words=60):
    """Truncate *text* to at most *max_words* whitespace-separated words.

    Words are re-joined with single spaces, so runs of whitespace in the
    input collapse. When words had to be dropped, ``"..."`` is appended
    directly after the last kept word.
    """
    tokens = text.split()
    if len(tokens) > max_words:
        return " ".join(tokens[:max_words]) + "..."
    # Nothing was cut off, so every token is kept and no ellipsis is added.
    return " ".join(tokens)

# Fetch every configured feed, normalise its five most recent entries and
# persist each one as "<sha256 of title+link>.json" inside `output_folder`.
# `indexed_articles` collects the same records in memory, in fetch order.
indexed_articles = []
for url in rss_feeds:
    feed = feedparser.parse(url)

    # A feed that could not be downloaded or parsed normally comes back with
    # no entries rather than raising, so report it instead of silently
    # skipping it. `bozo_exception`, when present, carries the cause.
    if not feed.entries:
        reason = feed.get("bozo_exception", "feed returned no entries")
        print(f"Warning: no entries from {url}: {reason}")
        continue

    # The feed-level title is the same for every entry; look it up once.
    source = feed.feed.get("title", "Unknown Source")

    for entry in feed.entries[:5]:
        title = entry.get("title", "")
        link = entry.get("link", "")
        summary_raw = entry.get("summary", "")
        # Fall back to the current time only when the feed gives no usable
        # publication date (missing or empty). The fallback is an explicit,
        # timezone-aware UTC timestamp; `datetime.utcnow()` is deprecated
        # and returns a naive value.
        published = entry.get("published") or datetime.now(timezone.utc).isoformat()

        clean_summary = clean_html(summary_raw)
        short_summary = summarize(clean_summary)

        # The id doubles as the file name, so re-running the script
        # overwrites an article's file instead of duplicating it.
        article_id = generate_hash(title + link)
        index_entry = {
            "id": article_id,
            "title": title,
            "summary": short_summary,
            "source": source,
            "link": link,
            "published": published,
            "topics": []
        }

        indexed_articles.append(index_entry)

        # Explicit encoding keeps the output independent of the platform's
        # default locale encoding.
        article_path = os.path.join(output_folder, f"{article_id}.json")
        with open(article_path, "w", encoding="utf-8") as f:
            json.dump(index_entry, f, indent=2)

print(f"Saved {len(indexed_articles)} articles to '{output_folder}' folder.")
