v0.2.0: Stable RSS feed handler and region-aware feed loader for CBC feeds

2025-11-03 22:53:37 -05:00 · 2025-11-03 22:53:37 -05:00 · ddc5576ec1
commit ddc5576ec1
parent 211ebfcd01
3 changed files with 535 additions and 0 deletions
--- a/config/feeds.json
+++ b/config/feeds.json
@ -0,0 +1,132 @@
+{
+  "Feeds": {
+    "CBC": [
+      {
+        "Name": "CBC Top Stories",
+        "Url": "https://www.cbc.ca/cmlink/rss-topstories",
+        "Category": "National",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC World",
+        "Url": "https://www.cbc.ca/cmlink/rss-world",
+        "Category": "International",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Politics",
+        "Url": "https://www.cbc.ca/cmlink/rss-politics",
+        "Category": "Politics",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Business",
+        "Url": "https://www.cbc.ca/cmlink/rss-business",
+        "Category": "Economy",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Health",
+        "Url": "https://www.cbc.ca/cmlink/rss-health",
+        "Category": "Health",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Technology & Science",
+        "Url": "https://www.cbc.ca/cmlink/rss-technology",
+        "Category": "Technology",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Arts & Entertainment",
+        "Url": "https://www.cbc.ca/cmlink/rss-arts",
+        "Category": "Arts",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Sports",
+        "Url": "https://www.cbc.ca/cmlink/rss-sports",
+        "Category": "Sports",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Local – Ottawa",
+        "Url": "https://www.cbc.ca/cmlink/rss-canada-ottawa",
+        "Category": "Local",
+        "Region": "Ottawa",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Local – Toronto",
+        "Url": "https://www.cbc.ca/cmlink/rss-canada-toronto",
+        "Category": "Local",
+        "Region": "Toronto",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Local – Montreal",
+        "Url": "https://www.cbc.ca/cmlink/rss-canada-montreal",
+        "Category": "Local",
+        "Region": "Montreal",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Local – Vancouver",
+        "Url": "https://www.cbc.ca/cmlink/rss-canada-britishcolumbia",
+        "Category": "Local",
+        "Region": "Vancouver",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Local – Calgary",
+        "Url": "https://www.cbc.ca/cmlink/rss-canada-calgary",
+        "Category": "Local",
+        "Region": "Calgary",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Local – Edmonton",
+        "Url": "https://www.cbc.ca/cmlink/rss-canada-edmonton",
+        "Category": "Local",
+        "Region": "Edmonton",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Local – Winnipeg",
+        "Url": "https://www.cbc.ca/cmlink/rss-canada-manitoba",
+        "Category": "Local",
+        "Region": "Winnipeg",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Local – Halifax",
+        "Url": "https://www.cbc.ca/cmlink/rss-canada-novascotia",
+        "Category": "Local",
+        "Region": "Halifax",
+        "Enabled": true
+      },
+      {
+        "Name": "CBC Local – St. John's",
+        "Url": "https://www.cbc.ca/cmlink/rss-canada-newfoundland",
+        "Category": "Local",
+        "Region": "St. John's",
+        "Enabled": true
+      }
+    ],
+    "WeatherNetwork": [
+      {
+        "Name": "WeatherNetwork National Weather",
+        "Url": "",
+        "Category": "Weather",
+        "Enabled": false
+      },
+      {
+        "Name": "WeatherNetwork Regional Feed",
+        "Url": "",
+        "Category": "Weather",
+        "Enabled": false
+      }
+    ],
+    "Other": []
+  }
+}
--- a/src/config/feed_loader.py
+++ b/src/config/feed_loader.py
@ -0,0 +1,134 @@
+# ============================================================
+# File: src/config/feed_loader.py
+# Description:
+#   Loads and filters RSS feed definitions from /config/feeds.json.
+#   Currently defaults to the Ottawa region for stable offline
+#   development. Region aliases and geolocation support remain
+#   in place for future activation.
+# ============================================================
+
+import os
+import json
+
+# ------------------------------------------------------------
+# Region alias table for common Canadian cities/suburbs
+# ------------------------------------------------------------
+REGION_ALIASES = {
+    # Ontario
+    "cornwall": "Ottawa",
+    "south glengarry": "Ottawa",
+    "glengarry": "Ottawa",
+    "kingston": "Ottawa",
+    "belleville": "Ottawa",
+
+    # Québec (map suburbs to Montréal)
+    "montreal": "Montreal",
+    "laval": "Montreal",
+    "leval": "Montreal",  # typo guard
+    "terrebonne": "Montreal",
+    "longueuil": "Montreal",
+    "brossard": "Montreal",
+    "repentigny": "Montreal",
+    "l'île-perrot": "Montreal",
+    "saint-lambert": "Montreal",
+
+    # British Columbia
+    "vancouver": "Vancouver",
+    "burnaby": "Vancouver",
+    "surrey": "Vancouver",
+    "richmond": "Vancouver",
+
+    # Alberta
+    "calgary": "Calgary",
+    "edmonton": "Edmonton",
+
+    # Manitoba
+    "winnipeg": "Winnipeg",
+
+    # Nova Scotia
+    "halifax": "Halifax",
+
+    # Newfoundland and Labrador
+    "st. john's": "St. John's",
+    "saint johns": "St. John's",
+}
+
+
+class FeedLoader:
+    """Handles loading and filtering RSS feed definitions."""
+
+    def __init__(self, config_dir: str = "config", region_override: str | None = None):
+        self.config_path = os.path.join(config_dir, "feeds.json")
+
+        # For now, IP-based detection is disabled.
+        # Use region_override if provided, otherwise default to Ottawa.
+        self.region = region_override or "Ottawa"
+
+        self.all_feeds = {}
+        self.active_feeds = []
+
+        self._load_json()
+        self._filter_active_feeds()
+
+    # ------------------------------------------------------------
+    def _load_json(self) -> None:
+        """Loads feeds.json and parses all sections."""
+        if not os.path.exists(self.config_path):
+            raise FileNotFoundError(f"Missing feeds.json at {self.config_path}")
+
+        with open(self.config_path, "r", encoding="utf-8") as file:
+            try:
+                data = json.load(file)
+            except json.JSONDecodeError as e:
+                raise ValueError(f"Invalid JSON in feeds.json: {e}")
+
+        if "Feeds" not in data or not isinstance(data["Feeds"], dict):
+            raise ValueError("feeds.json must contain a 'Feeds' dictionary.")
+
+        self.all_feeds = data["Feeds"]
+
+    # ------------------------------------------------------------
+    def _filter_active_feeds(self) -> None:
+        """Filters feeds for the detected region and enabled status."""
+        active = []
+
+        for provider, feed_list in self.all_feeds.items():
+            for feed in feed_list:
+                if not feed.get("Enabled", False):
+                    continue
+
+                feed_region = feed.get("Region")
+                if feed_region and feed_region != self.region:
+                    continue  # Skip other CBC regions
+
+                active.append({
+                    "Provider": provider,
+                    "Name": feed.get("Name", "Unnamed Feed"),
+                    "Url": feed.get("Url"),
+                    "Category": feed.get("Category", "General"),
+                    "Region": feed.get("Region", None)
+                })
+
+        self.active_feeds = active
+        print(f"[info] Loaded {len(self.active_feeds)} active feeds for region: {self.region}")
+
+    # ------------------------------------------------------------
+    def get_active_feeds(self) -> list[dict]:
+        """Returns the list of active region-appropriate feeds."""
+        return self.active_feeds
+
+    def get_region(self) -> str:
+        """Returns the currently set region."""
+        return self.region
+
+
+# ------------------------------------------------------------
+# Example usage (manual testing)
+# ------------------------------------------------------------
+if __name__ == "__main__":
+    loader = FeedLoader()
+    feeds = loader.get_active_feeds()
+
+    print(f"Detected Region: {loader.get_region()}")
+    for feed in feeds:
+        print(f"- {feed['Name']} ({feed['Url']}) [{feed['Provider']}]")
--- a/src/rss/rss_feedHandler.py
+++ b/src/rss/rss_feedHandler.py
@ -0,0 +1,269 @@
+# ============================================================
+# File: src/rss/rss_feedHandler.py
+# Description:
+#   Handles RSS feed retrieval, caching, and validation for
+#   the Telefact broadcaster. Uses FeedLoader for region-aware
+#   feed selection and stores cached data under Cache/Feeds/<Region>.
+#   Automatically refreshes feeds only if the cache is older than
+#   the configured update interval (default: 10 minutes).
+#   Optimized for CBC RSS: ignores image-only items and non-story types.
+# ============================================================
+
+import json
+import os
+import re
+import time
+import xml.etree.ElementTree as ET
+from datetime import datetime, timezone
+from email.utils import parsedate_to_datetime
+from html import unescape
+
+import requests
+
+from src.config.feed_loader import FeedLoader
+
+
+class RSSFeedHandler:
+    """Handles downloading, caching, and parsing of RSS feeds."""
+
+    def __init__(
+        self,
+        cache_dir: str = "Cache/Feeds",
+        config_dir: str = "config",
+        cache_duration_minutes: int = 10,
+        story_limit: int = 6,
+    ):
+        self.cache_dir = cache_dir
+        self.cache_duration = cache_duration_minutes * 60  # seconds
+        self.feed_loader = FeedLoader(config_dir=config_dir)
+        self.region = self.feed_loader.get_region()
+        self.story_limit = story_limit
+
+        self._ensure_cache_dirs()
+
+    # ------------------------------------------------------------
+    def _ensure_cache_dirs(self) -> None:
+        """Ensures regional cache directories exist."""
+        region_path = os.path.join(self.cache_dir, self.region)
+        os.makedirs(region_path, exist_ok=True)
+
+    # ------------------------------------------------------------
+    def _get_cache_path(self, feed_name: str) -> str:
+        """Generates path to feed cache file."""
+        safe_name = feed_name.replace(" ", "_").replace("/", "_")
+        return os.path.join(self.cache_dir, self.region, f"{safe_name}.json")
+
+    # ------------------------------------------------------------
+    def _is_cache_valid(self, path: str) -> bool:
+        """Returns True if cache exists and is within valid time."""
+        if not os.path.exists(path):
+            return False
+        age = time.time() - os.path.getmtime(path)
+        return age < self.cache_duration
+
+    # ------------------------------------------------------------
+    def _fetch_rss(self, url: str) -> str | None:
+        """Fetches raw RSS XML from a given URL with retry & spoofed UA."""
+        headers = {
+            "User-Agent": (
+                "Mozilla/5.0 (X11; Linux x86_64) "
+                "AppleWebKit/537.36 (KHTML, like Gecko) "
+                "Chrome/121.0 Safari/537.36"
+            )
+        }
+
+        for attempt in range(3):
+            try:
+                response = requests.get(url, headers=headers, timeout=20)
+                if response.status_code == 200:
+                    return response.text
+                print(f"[warn] RSS fetch failed ({response.status_code}) for {url}")
+                break  # non-200 is not retryable
+            except requests.Timeout:
+                print(f"[warn] Timeout {attempt + 1}/3 for {url}")
+            except requests.RequestException as e:
+                print(f"[warn] RSS fetch error ({attempt + 1}/3): {e}")
+            time.sleep(2 ** attempt)  # backoff: 1 s, 2 s, 4 s
+
+        return None
+
+    # ------------------------------------------------------------
+    def _strip_html(self, text: str) -> str:
+        """Removes HTML tags and decodes entities."""
+        # Remove <img> and <a> and all tags
+        clean = re.sub(r"<img[^>]*>", "", text)
+        clean = re.sub(r"<[^>]+>", "", clean)
+        clean = unescape(clean)
+        return clean.strip()
+
+    # ------------------------------------------------------------
+    def _parse_rss(self, xml_data: str) -> list[dict]:
+        """Parses RSS XML into a list of story dictionaries, CBC-optimized."""
+        try:
+            root = ET.fromstring(xml_data)
+        except ET.ParseError as e:
+            print(f"[warn] XML parse error: {e}")
+            return []
+
+        channel = root.find("channel")
+        if channel is None:
+            return []
+
+        stories = []
+        for item in channel.findall("item"):
+            # --- CBC namespaced attributes ---
+            cbc_type = ""
+            deptid = ""
+
+            for child in item:
+                tag = child.tag.lower()
+                if tag.endswith("type"):
+                    cbc_type = (child.text or "").strip()
+                elif tag.endswith("deptid"):
+                    deptid = (child.text or "").strip()
+
+            # Skip if not a story
+            if cbc_type.lower() not in ("story", ""):
+                continue
+
+            title = item.findtext("title", "").strip()
+            link = item.findtext("link", "").strip()
+            description = item.findtext("description", "").strip()
+            pub_date = item.findtext("pubDate", "").strip()
+
+            # Remove CDATA wrappers
+            title = title.replace("<![CDATA[", "").replace("]]>", "").strip()
+            description = description.replace("<![CDATA[", "").replace("]]>", "").strip()
+
+            # Strip HTML
+            clean_text = self._strip_html(description)
+
+            # Skip empty or image-only items
+            if not clean_text or len(clean_text) < 20:
+                continue
+
+            # Truncate for Teletext readability
+            summary = clean_text[:500].rstrip()
+
+            stories.append({
+                "Title": title,
+                "Summary": summary,
+                "Link": link,
+                "PubDate": pub_date,
+                "DeptID": deptid,
+                "Provider": "CBC",
+            })
+
+        if not stories:
+            print("[debug] No valid <cbc:type>story</cbc:type> entries found — may be namespace issue.")
+        else:
+            print(f"[debug] Parsed {len(stories)} stories successfully.")
+
+        # Sort newest → oldest
+        stories = self._sort_stories(stories)
+
+        # Limit to N stories
+        if self.story_limit > 0:
+            stories = stories[:self.story_limit]
+
+        return stories
+
+    # ------------------------------------------------------------
+    def _sort_stories(self, stories: list[dict]) -> list[dict]:
+        """Sorts stories by publication date, newest first."""
+        def parse_date(pubdate: str):
+            try:
+                return datetime.strptime(pubdate, "%a, %d %b %Y %H:%M:%S %Z")
+            except Exception:
+                return datetime.min
+
+        return sorted(stories, key=lambda s: parse_date(s.get("PubDate", "")), reverse=True)
+
+    # ------------------------------------------------------------
+    def _load_cached_feed(self, path: str) -> list[dict]:
+        """Loads cached JSON feed data if available."""
+        try:
+            with open(path, "r", encoding="utf-8") as file:
+                return json.load(file)
+        except Exception as e:
+            print(f"[warn] Could not load cache: {e}")
+            return []
+
+    # ------------------------------------------------------------
+    def _save_cache(self, path: str, stories: list[dict]) -> None:
+        """Saves parsed stories to local cache."""
+        try:
+            with open(path, "w", encoding="utf-8") as file:
+                json.dump(
+                    {"LastUpdated": datetime.utcnow().isoformat(), "Stories": stories},
+                    file,
+                    ensure_ascii=False,
+                    indent=2,
+                )
+        except Exception as e:
+            print(f"[warn] Failed to save cache for {path}: {e}")
+
+    # ------------------------------------------------------------
+    def update_feeds(self, force: bool = False) -> None:
+        """Fetches and caches all active feeds."""
+        active_feeds = self.feed_loader.get_active_feeds()
+        if not active_feeds:
+            print("[warn] No active feeds to update.")
+            return
+
+        print(f"[info] Updating {len(active_feeds)} feeds for region: {self.region}")
+
+        for feed in active_feeds:
+            feed_name = feed["Name"]
+            feed_url = feed["Url"]
+            cache_path = self._get_cache_path(feed_name)
+
+            if not force and self._is_cache_valid(cache_path):
+                print(f"[info] Cache valid for: {feed_name}")
+                continue
+
+            print(f"[info] Fetching: {feed_name}")
+            xml_data = self._fetch_rss(feed_url)
+            if not xml_data:
+                continue
+
+            stories = self._parse_rss(xml_data)
+            if not stories:
+                print(f"[warn] No valid stories found in {feed_name}")
+                continue
+
+            self._save_cache(cache_path, stories)
+            print(f"[info] Cached {len(stories)} stories from {feed_name}")
+
+    # ------------------------------------------------------------
+    def load_cached_feeds(self) -> dict[str, list[dict]]:
+        """Loads all cached feeds for the current region."""
+        region_path = os.path.join(self.cache_dir, self.region)
+        if not os.path.exists(region_path):
+            return {}
+
+        cached_data = {}
+        for filename in os.listdir(region_path):
+            if not filename.endswith(".json"):
+                continue
+            path = os.path.join(region_path, filename)
+            feed_name = filename.replace(".json", "").replace("_", " ")
+            data = self._load_cached_feed(path)
+            cached_data[feed_name] = data
+
+        return cached_data
+
+
+# ------------------------------------------------------------
+# Example usage (manual test)
+# ------------------------------------------------------------
+if __name__ == "__main__":
+    handler = RSSFeedHandler(cache_duration_minutes=10, story_limit=6)
+    handler.update_feeds()
+
+    cached = handler.load_cached_feeds()
+    for feed, data in cached.items():
+        print(f"\n=== {feed} ===")
+        if isinstance(data, dict):
+            stories = data.get("Stories", [])
+        else:
+            stories = data
+        for story in stories:
+            print(f"- {story['Title']}")