diff --git a/config/feeds.json b/config/feeds.json
new file mode 100644
index 0000000..b7e3776
--- /dev/null
+++ b/config/feeds.json
@@ -0,0 +1,132 @@
+{
+  "Feeds": {
+    "CBC": [
+      {
+        "Name": "CBC Top Stories",
+        "Url": "https://www.cbc.ca/cmlink/rss-topstories",
+        "Category": "National",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC World",
+        "Url": "https://www.cbc.ca/cmlink/rss-world",
+        "Category": "International",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Politics",
+        "Url": "https://www.cbc.ca/cmlink/rss-politics",
+        "Category": "Politics",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Business",
+        "Url": "https://www.cbc.ca/cmlink/rss-business",
+        "Category": "Economy",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Health",
+        "Url": "https://www.cbc.ca/cmlink/rss-health",
+        "Category": "Health",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Technology & Science",
+        "Url": "https://www.cbc.ca/cmlink/rss-technology",
+        "Category": "Technology",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Arts & Entertainment",
+        "Url": "https://www.cbc.ca/cmlink/rss-arts",
+        "Category": "Arts",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Sports",
+        "Url": "https://www.cbc.ca/cmlink/rss-sports",
+        "Category": "Sports",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Local – Ottawa",
+        "Url": "https://www.cbc.ca/cmlink/rss-canada-ottawa",
+        "Category": "Local",
+        "Region": "Ottawa",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Local – Toronto",
+        "Url": "https://www.cbc.ca/cmlink/rss-canada-toronto",
+        "Category": "Local",
+        "Region": "Toronto",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Local – Montreal",
+        "Url": "https://www.cbc.ca/cmlink/rss-canada-montreal",
+        "Category": "Local",
+        "Region": "Montreal",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Local – Vancouver",
+        "Url": "https://www.cbc.ca/cmlink/rss-canada-britishcolumbia",
+        "Category": "Local",
+        "Region": "Vancouver",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Local – Calgary",
+        "Url": "https://www.cbc.ca/cmlink/rss-canada-calgary",
+        "Category": "Local",
+        "Region": "Calgary",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Local – Edmonton",
+        "Url": "https://www.cbc.ca/cmlink/rss-canada-edmonton",
+        "Category": "Local",
+        "Region": "Edmonton",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Local – Winnipeg",
+        "Url": "https://www.cbc.ca/cmlink/rss-canada-manitoba",
+        "Category": "Local",
+        "Region": "Winnipeg",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Local – Halifax",
+        "Url": "https://www.cbc.ca/cmlink/rss-canada-novascotia",
+        "Category": "Local",
+        "Region": "Halifax",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Local – St. John's",
+        "Url": "https://www.cbc.ca/cmlink/rss-canada-newfoundland",
+        "Category": "Local",
+        "Region": "St. John's",
+        "Enabled": true
+      }
+    ],
+    "WeatherNetwork": [
+      {
+        "Name": "WeatherNetwork National Weather",
+        "Url": "",
+        "Category": "Weather",
+        "Enabled": false
+      },
+      {
+        "Name": "WeatherNetwork Regional Feed",
+        "Url": "",
+        "Category": "Weather",
+        "Enabled": false
+      }
+    ],
+    "Other": []
+  }
+}
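Note on the schema: each entry carries `Name`, `Url`, `Category`, and `Enabled`, plus an optional `Region` used only by the CBC local feeds. A quick standalone sanity check for future edits to this file might look like the sketch below; the rules are inferred from how `FeedLoader` and `RSSFeedHandler` read entries, not from a formal schema, so treat it as illustrative.

```python
import json

# Illustrative check only: the required keys (Name/Url/Category/Enabled,
# optional Region) are inferred from FeedLoader's usage, not a formal schema.
with open("config/feeds.json", encoding="utf-8") as f:
    data = json.load(f)

for provider, feeds in data["Feeds"].items():
    for feed in feeds:
        missing = {"Name", "Url", "Category", "Enabled"} - feed.keys()
        if missing:
            print(f"[warn] {provider}/{feed.get('Name', '?')}: missing {sorted(missing)}")
        if feed.get("Enabled") and not feed.get("Url"):
            print(f"[warn] {provider}/{feed['Name']}: enabled feed with empty Url")
```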
diff --git a/src/config/feed_loader.py b/src/config/feed_loader.py
new file mode 100644
index 0000000..5eb8882
--- /dev/null
+++ b/src/config/feed_loader.py
@@ -0,0 +1,134 @@
+# ============================================================
+# File: src/config/feed_loader.py
+# Description:
+#   Loads and filters RSS feed definitions from /config/feeds.json.
+#   Currently defaults to the Ottawa region for stable offline
+#   development. Region aliases and geolocation support remain
+#   in place for future activation.
+# ============================================================
+
+import os
+import json
+
+# ------------------------------------------------------------
+# Region alias table for common Canadian cities/suburbs
+# ------------------------------------------------------------
+REGION_ALIASES = {
+    # Ontario
+    "cornwall": "Ottawa",
+    "south glengarry": "Ottawa",
+    "glengarry": "Ottawa",
+    "kingston": "Ottawa",
+    "belleville": "Ottawa",
+
+    # Québec (map suburbs to Montréal)
+    "montreal": "Montreal",
+    "laval": "Montreal",
+    "leval": "Montreal",  # typo guard
+    "terrebonne": "Montreal",
+    "longueuil": "Montreal",
+    "brossard": "Montreal",
+    "repentigny": "Montreal",
+    "l'île-perrot": "Montreal",
+    "saint-lambert": "Montreal",
+
+    # British Columbia
+    "vancouver": "Vancouver",
+    "burnaby": "Vancouver",
+    "surrey": "Vancouver",
+    "richmond": "Vancouver",
+
+    # Alberta
+    "calgary": "Calgary",
+    "edmonton": "Edmonton",
+
+    # Manitoba
+    "winnipeg": "Winnipeg",
+
+    # Nova Scotia
+    "halifax": "Halifax",
+
+    # Newfoundland and Labrador
+    "st. john's": "St. John's",
+    "saint johns": "St. John's",
+}
+
+
+class FeedLoader:
+    """Handles loading and filtering RSS feed definitions."""
+
+    def __init__(self, config_dir: str = "config", region_override: str | None = None):
+        self.config_path = os.path.join(config_dir, "feeds.json")
+
+        # For now, IP-based detection is disabled.
+        # Use region_override if provided, otherwise default to Ottawa.
+        self.region = region_override or "Ottawa"
+
+        self.all_feeds = {}
+        self.active_feeds = []
+
+        self._load_json()
+        self._filter_active_feeds()
+
+    # ------------------------------------------------------------
+    def _load_json(self) -> None:
+        """Loads feeds.json and parses all sections."""
+        if not os.path.exists(self.config_path):
+            raise FileNotFoundError(f"Missing feeds.json at {self.config_path}")
+
+        with open(self.config_path, "r", encoding="utf-8") as file:
+            try:
+                data = json.load(file)
+            except json.JSONDecodeError as e:
+                raise ValueError(f"Invalid JSON in feeds.json: {e}") from e
+
+        if "Feeds" not in data or not isinstance(data["Feeds"], dict):
+            raise ValueError("feeds.json must contain a 'Feeds' dictionary.")
+
+        self.all_feeds = data["Feeds"]
+
+    # ------------------------------------------------------------
+    def _filter_active_feeds(self) -> None:
+        """Filters feeds for the configured region and enabled status."""
+        active = []
+
+        for provider, feed_list in self.all_feeds.items():
+            for feed in feed_list:
+                if not feed.get("Enabled", False):
+                    continue
+
+                feed_region = feed.get("Region")
+                if feed_region and feed_region != self.region:
+                    continue  # Skip other CBC regions
+
+                active.append({
+                    "Provider": provider,
+                    "Name": feed.get("Name", "Unnamed Feed"),
+                    "Url": feed.get("Url"),
+                    "Category": feed.get("Category", "General"),
+                    "Region": feed.get("Region"),
+                })
+
+        self.active_feeds = active
+        print(f"[info] Loaded {len(self.active_feeds)} active feeds for region: {self.region}")
+
+    # ------------------------------------------------------------
+    def get_active_feeds(self) -> list[dict]:
+        """Returns the list of active region-appropriate feeds."""
+        return self.active_feeds
+
+    def get_region(self) -> str:
+        """Returns the currently set region."""
+        return self.region
+
+
+# ------------------------------------------------------------
+# Example usage (manual testing)
+# ------------------------------------------------------------
+if __name__ == "__main__":
+    loader = FeedLoader()
+    feeds = loader.get_active_feeds()
+
+    print(f"Detected Region: {loader.get_region()}")
+    for feed in feeds:
+        print(f"- {feed['Name']} ({feed['Url']}) [{feed['Provider']}]")
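Nothing in the loader consults `REGION_ALIASES` yet; it is staged for when geolocation is re-enabled. When that happens, resolution could be as small as the sketch below. The detected city string would come from whatever IP lookup eventually gets wired in (not part of this diff), and the result would be passed to `FeedLoader` as `region_override`.

```python
from src.config.feed_loader import REGION_ALIASES, FeedLoader

def resolve_region(city: str, default: str = "Ottawa") -> str:
    """Maps a detected city/suburb name onto one of the CBC feed regions."""
    return REGION_ALIASES.get(city.strip().lower(), default)

# resolve_region("South Glengarry")  -> "Ottawa"
# resolve_region("Brossard")         -> "Montreal"
# resolve_region("Somewhere Else")   -> "Ottawa" (fallback)

loader = FeedLoader(region_override=resolve_region("Laval"))
```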
diff --git a/src/rss/rss_feedHandler.py b/src/rss/rss_feedHandler.py
new file mode 100644
index 0000000..7bbef6b
--- /dev/null
+++ b/src/rss/rss_feedHandler.py
@@ -0,0 +1,272 @@
+# ============================================================
+# File: src/rss/rss_feedHandler.py
+# Description:
+#   Handles RSS feed retrieval, caching, and validation for
+#   the Telefact broadcaster. Uses FeedLoader for region-aware
+#   feed selection and stores cached data under Cache/Feeds/.
+#   Automatically refreshes feeds only if the cache is older than
+#   the configured update interval (default: 10 minutes).
+#   Optimized for CBC RSS: ignores image-only items and non-story types.
+# ============================================================
+
+import os
+import re
+import json
+import time
+import requests
+import xml.etree.ElementTree as ET
+from html import unescape
+from datetime import datetime
+from email.utils import parsedate_to_datetime
+from src.config.feed_loader import FeedLoader
+
+
+class RSSFeedHandler:
+    """Handles downloading, caching, and parsing of RSS feeds."""
+
+    def __init__(
+        self,
+        cache_dir: str = "Cache/Feeds",
+        config_dir: str = "config",
+        cache_duration_minutes: int = 10,
+        story_limit: int = 6,
+    ):
+        self.cache_dir = cache_dir
+        self.cache_duration = cache_duration_minutes * 60  # seconds
+        self.feed_loader = FeedLoader(config_dir=config_dir)
+        self.region = self.feed_loader.get_region()
+        self.story_limit = story_limit
+
+        self._ensure_cache_dirs()
+
+    # ------------------------------------------------------------
+    def _ensure_cache_dirs(self) -> None:
+        """Ensures regional cache directories exist."""
+        region_path = os.path.join(self.cache_dir, self.region)
+        os.makedirs(region_path, exist_ok=True)
+
+    # ------------------------------------------------------------
+    def _get_cache_path(self, feed_name: str) -> str:
+        """Generates path to feed cache file."""
+        safe_name = feed_name.replace(" ", "_").replace("/", "_")
+        return os.path.join(self.cache_dir, self.region, f"{safe_name}.json")
+
+    # ------------------------------------------------------------
+    def _is_cache_valid(self, path: str) -> bool:
+        """Returns True if cache exists and is within valid time."""
+        if not os.path.exists(path):
+            return False
+        age = time.time() - os.path.getmtime(path)
+        return age < self.cache_duration
+
+    # ------------------------------------------------------------
+    def _fetch_rss(self, url: str) -> str | None:
+        """Fetches raw RSS XML from a given URL with retry & spoofed UA."""
+        headers = {
+            "User-Agent": (
+                "Mozilla/5.0 (X11; Linux x86_64) "
+                "AppleWebKit/537.36 (KHTML, like Gecko) "
+                "Chrome/121.0 Safari/537.36"
+            )
+        }
+
+        for attempt in range(3):
+            try:
+                response = requests.get(url, headers=headers, timeout=20)
+                if response.status_code == 200:
+                    return response.text
+                print(f"[warn] RSS fetch failed ({response.status_code}) for {url}")
+                break  # non-200 is not retryable
+            except requests.Timeout:
+                print(f"[warn] Timeout {attempt + 1}/3 for {url}")
+            except requests.RequestException as e:
+                print(f"[warn] RSS fetch error ({attempt + 1}/3): {e}")
+            time.sleep(2 ** attempt)  # backoff: 1 s, 2 s, 4 s
+
+        return None
+
+    # ------------------------------------------------------------
+    def _strip_html(self, text: str) -> str:
+        """Removes HTML tags and decodes entities."""
+        # Remove <img> tags first, then all remaining tags
+        clean = re.sub(r"<img[^>]*>", "", text)
+        clean = re.sub(r"<[^>]+>", "", clean)
+        clean = unescape(clean)
+        return clean.strip()
+
+    # ------------------------------------------------------------
+    def _parse_rss(self, xml_data: str) -> list[dict]:
+        """Parses RSS XML into a list of story dictionaries, CBC-optimized."""
+        try:
+            root = ET.fromstring(xml_data)
+        except ET.ParseError as e:
+            print(f"[warn] XML parse error: {e}")
+            return []
+
+        channel = root.find("channel")
+        if channel is None:
+            return []
+
+        stories = []
+        for item in channel.findall("item"):
+            # --- CBC namespaced attributes (cbc:type, cbc:deptid) ---
+            cbc_type = ""
+            deptid = ""
+
+            for child in item:
+                tag = child.tag.lower()
+                if tag.endswith("type"):
+                    cbc_type = (child.text or "").strip()
+                elif tag.endswith("deptid"):
+                    deptid = (child.text or "").strip()
+
+            # Skip if not a story
+            if cbc_type.lower() not in ("story", ""):
+                continue
+
+            title = item.findtext("title", "").strip()
+            link = item.findtext("link", "").strip()
+            description = item.findtext("description", "").strip()
+            pub_date = item.findtext("pubDate", "").strip()
+
+            # Remove CDATA wrappers (defensive; ElementTree usually unwraps them)
+            title = title.replace("<![CDATA[", "").replace("]]>", "").strip()
+            description = description.replace("<![CDATA[", "").replace("]]>", "").strip()
+
+            # Strip HTML
+            clean_text = self._strip_html(description)
+
+            # Skip empty or image-only items
+            if not clean_text or len(clean_text) < 20:
+                continue
+
+            # Truncate for Teletext readability
+            summary = clean_text[:500].rstrip()
+
+            stories.append({
+                "Title": title,
+                "Summary": summary,
+                "Link": link,
+                "PubDate": pub_date,
+                "DeptID": deptid,
+                "Provider": "CBC",
+            })
+
+        if not stories:
+            print("[debug] No valid story entries found; may be a namespace issue.")
+        else:
+            print(f"[debug] Parsed {len(stories)} stories successfully.")
+
+        # Sort newest → oldest
+        stories = self._sort_stories(stories)
+
+        # Limit to N stories
+        if self.story_limit > 0:
+            stories = stories[:self.story_limit]
+
+        return stories
+
+    # ------------------------------------------------------------
+    def _sort_stories(self, stories: list[dict]) -> list[dict]:
+        """Sorts stories by publication date, newest first."""
+        def parse_date(pubdate: str) -> float:
+            # RSS uses RFC 822 dates ("Mon, 06 Jan 2025 12:00:00 EST");
+            # strptime's %Z is unreliable for zone names, so use email.utils.
+            try:
+                return parsedate_to_datetime(pubdate).timestamp()
+            except Exception:
+                return 0.0
+
+        return sorted(stories, key=lambda s: parse_date(s.get("PubDate", "")), reverse=True)
+
+    # ------------------------------------------------------------
+    def _load_cached_feed(self, path: str) -> dict | list:
+        """Loads cached JSON feed data if available."""
+        try:
+            with open(path, "r", encoding="utf-8") as file:
+                return json.load(file)
+        except Exception as e:
+            print(f"[warn] Could not load cache: {e}")
+            return []
+
+    # ------------------------------------------------------------
+    def _save_cache(self, path: str, stories: list[dict]) -> None:
+        """Saves parsed stories to local cache."""
+        try:
+            with open(path, "w", encoding="utf-8") as file:
+                json.dump(
+                    {"LastUpdated": datetime.utcnow().isoformat(), "Stories": stories},
+                    file,
+                    ensure_ascii=False,
+                    indent=2,
+                )
+        except Exception as e:
+            print(f"[warn] Failed to save cache for {path}: {e}")
+
+    # ------------------------------------------------------------
+    def update_feeds(self, force: bool = False) -> None:
+        """Fetches and caches all active feeds."""
+        active_feeds = self.feed_loader.get_active_feeds()
+        if not active_feeds:
+            print("[warn] No active feeds to update.")
+            return
+
+        print(f"[info] Updating {len(active_feeds)} feeds for region: {self.region}")
+
+        for feed in active_feeds:
+            feed_name = feed["Name"]
+            feed_url = feed["Url"]
+            cache_path = self._get_cache_path(feed_name)
+
+            if not force and self._is_cache_valid(cache_path):
+                print(f"[info] Cache valid for: {feed_name}")
+                continue
+
+            print(f"[info] Fetching: {feed_name}")
+            xml_data = self._fetch_rss(feed_url)
+            if not xml_data:
+                continue
+
+            stories = self._parse_rss(xml_data)
+            if not stories:
+                print(f"[warn] No valid stories found in {feed_name}")
+                continue
+
+            self._save_cache(cache_path, stories)
+            print(f"[info] Cached {len(stories)} stories from {feed_name}")
+
+    # ------------------------------------------------------------
+    def load_cached_feeds(self) -> dict[str, dict | list]:
+        """Loads all cached feeds for the current region."""
+        region_path = os.path.join(self.cache_dir, self.region)
+        if not os.path.exists(region_path):
+            return {}
+
+        cached_data = {}
+        for filename in os.listdir(region_path):
+            if not filename.endswith(".json"):
+                continue
+            path = os.path.join(region_path, filename)
+            feed_name = filename.removesuffix(".json").replace("_", " ")
+            data = self._load_cached_feed(path)
+            cached_data[feed_name] = data
+
+        return cached_data
+
+
+# ------------------------------------------------------------
+# Example usage (manual test)
+# ------------------------------------------------------------
+if __name__ == "__main__":
+    handler = RSSFeedHandler(cache_duration_minutes=10, story_limit=6)
+    handler.update_feeds()
+
+    cached = handler.load_cached_feeds()
+    for feed, data in cached.items():
+        print(f"\n=== {feed} ===")
+        if isinstance(data, dict):
+            stories = data.get("Stories", [])
+        else:
+            stories = data
+        for story in stories:
+            print(f"- {story['Title']}")
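For context on how these pieces fit together downstream: a page renderer would pull from `load_cached_feeds()` rather than hitting the network, since `update_feeds()` is a no-op while caches are fresher than the configured interval. The sketch below shows one plausible consumption pattern, assuming the classic 40-column teletext row width; the actual page renderer is not part of this diff.

```python
import textwrap

from src.rss.rss_feedHandler import RSSFeedHandler

handler = RSSFeedHandler()
handler.update_feeds()  # refreshes only feeds whose cache is older than 10 minutes

for feed_name, data in handler.load_cached_feeds().items():
    # Cache files are {"LastUpdated": ..., "Stories": [...]}; tolerate bare lists too.
    stories = data.get("Stories", []) if isinstance(data, dict) else data
    print(f"== {feed_name} ==")
    for story in stories:
        print(story["Title"][:40])                         # clamp title to one row
        for row in textwrap.wrap(story["Summary"], 40)[:4]:  # first few body rows
            print(row)
        print()
```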