v0.2.0: Stable RSS feed handler and region-aware feed loader for CBC feeds

This commit is contained in:
Stu Leak 2025-11-03 22:53:37 -05:00
parent 211ebfcd01
commit ddc5576ec1
3 changed files with 535 additions and 0 deletions

132
config/feeds.json Normal file
View File

@ -0,0 +1,132 @@
{
"Feeds": {
"CBC": [
{
"Name": "CBC Top Stories",
"Url": "https://www.cbc.ca/cmlink/rss-topstories",
"Category": "National",
"Enabled": true
},
{
"Name": "CBC World",
"Url": "https://www.cbc.ca/cmlink/rss-world",
"Category": "International",
"Enabled": true
},
{
"Name": "CBC Politics",
"Url": "https://www.cbc.ca/cmlink/rss-politics",
"Category": "Politics",
"Enabled": true
},
{
"Name": "CBC Business",
"Url": "https://www.cbc.ca/cmlink/rss-business",
"Category": "Economy",
"Enabled": true
},
{
"Name": "CBC Health",
"Url": "https://www.cbc.ca/cmlink/rss-health",
"Category": "Health",
"Enabled": true
},
{
"Name": "CBC Technology & Science",
"Url": "https://www.cbc.ca/cmlink/rss-technology",
"Category": "Technology",
"Enabled": true
},
{
"Name": "CBC Arts & Entertainment",
"Url": "https://www.cbc.ca/cmlink/rss-arts",
"Category": "Arts",
"Enabled": true
},
{
"Name": "CBC Sports",
"Url": "https://www.cbc.ca/cmlink/rss-sports",
"Category": "Sports",
"Enabled": true
},
{
"Name": "CBC Local Ottawa",
"Url": "https://www.cbc.ca/cmlink/rss-canada-ottawa",
"Category": "Local",
"Region": "Ottawa",
"Enabled": true
},
{
"Name": "CBC Local Toronto",
"Url": "https://www.cbc.ca/cmlink/rss-canada-toronto",
"Category": "Local",
"Region": "Toronto",
"Enabled": true
},
{
"Name": "CBC Local Montreal",
"Url": "https://www.cbc.ca/cmlink/rss-canada-montreal",
"Category": "Local",
"Region": "Montreal",
"Enabled": true
},
{
"Name": "CBC Local Vancouver",
"Url": "https://www.cbc.ca/cmlink/rss-canada-britishcolumbia",
"Category": "Local",
"Region": "Vancouver",
"Enabled": true
},
{
"Name": "CBC Local Calgary",
"Url": "https://www.cbc.ca/cmlink/rss-canada-calgary",
"Category": "Local",
"Region": "Calgary",
"Enabled": true
},
{
"Name": "CBC Local Edmonton",
"Url": "https://www.cbc.ca/cmlink/rss-canada-edmonton",
"Category": "Local",
"Region": "Edmonton",
"Enabled": true
},
{
"Name": "CBC Local Winnipeg",
"Url": "https://www.cbc.ca/cmlink/rss-canada-manitoba",
"Category": "Local",
"Region": "Winnipeg",
"Enabled": true
},
{
"Name": "CBC Local Halifax",
"Url": "https://www.cbc.ca/cmlink/rss-canada-novascotia",
"Category": "Local",
"Region": "Halifax",
"Enabled": true
},
{
"Name": "CBC Local St. John's",
"Url": "https://www.cbc.ca/cmlink/rss-canada-newfoundland",
"Category": "Local",
"Region": "St. John's",
"Enabled": true
}
],
"WeatherNetwork": [
{
"Name": "WeatherNetwork National Weather",
"Url": "",
"Category": "Weather",
"Enabled": false
},
{
"Name": "WeatherNetwork Regional Feed",
"Url": "",
"Category": "Weather",
"Enabled": false
}
],
"Other": []
}
}

134
src/config/feed_loader.py Normal file
View File

@ -0,0 +1,134 @@
# ============================================================
# File: src/config/feed_loader.py
# Description:
# Loads and filters RSS feed definitions from /config/feeds.json.
# Currently defaults to the Ottawa region for stable offline
# development. Region aliases and geolocation support remain
# in place for future activation.
# ============================================================
import os
import json
# ------------------------------------------------------------
# Region alias table for common Canadian cities/suburbs
# ------------------------------------------------------------
# Maps a lowercase place name to the canonical region label used in the
# "Region" field of feeds.json. Keys are lowercase, so callers should
# lowercase (and strip) user input before looking it up. Every region that
# has a local feed also maps to itself so a direct city name resolves.
REGION_ALIASES = {
    # Ontario
    "ottawa": "Ottawa",
    "cornwall": "Ottawa",
    "south glengarry": "Ottawa",
    "glengarry": "Ottawa",
    "kingston": "Ottawa",
    "belleville": "Ottawa",
    "toronto": "Toronto",
    # Québec (map suburbs to Montréal)
    "montreal": "Montreal",
    "laval": "Montreal",
    "leval": "Montreal",  # typo guard
    "terrebonne": "Montreal",
    "longueuil": "Montreal",
    "brossard": "Montreal",
    "repentigny": "Montreal",
    "l'île-perrot": "Montreal",
    "saint-lambert": "Montreal",
    # British Columbia
    "vancouver": "Vancouver",
    "burnaby": "Vancouver",
    "surrey": "Vancouver",
    "richmond": "Vancouver",
    # Alberta
    "calgary": "Calgary",
    "edmonton": "Edmonton",
    # Manitoba
    "winnipeg": "Winnipeg",
    # Nova Scotia
    "halifax": "Halifax",
    # Newfoundland and Labrador
    "st. john's": "St. John's",
    "saint johns": "St. John's",
}
class FeedLoader:
"""Handles loading and filtering RSS feed definitions."""
def __init__(self, config_dir: str = "config", region_override: str | None = None):
self.config_path = os.path.join(config_dir, "feeds.json")
# For now, IP-based detection is disabled.
# Use region_override if provided, otherwise default to Ottawa.
self.region = region_override or "Ottawa"
self.all_feeds = {}
self.active_feeds = []
self._load_json()
self._filter_active_feeds()
# ------------------------------------------------------------
def _load_json(self) -> None:
"""Loads feeds.json and parses all sections."""
if not os.path.exists(self.config_path):
raise FileNotFoundError(f"Missing feeds.json at {self.config_path}")
with open(self.config_path, "r", encoding="utf-8") as file:
try:
data = json.load(file)
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON in feeds.json: {e}")
if "Feeds" not in data or not isinstance(data["Feeds"], dict):
raise ValueError("feeds.json must contain a 'Feeds' dictionary.")
self.all_feeds = data["Feeds"]
# ------------------------------------------------------------
def _filter_active_feeds(self) -> None:
"""Filters feeds for the detected region and enabled status."""
active = []
for provider, feed_list in self.all_feeds.items():
for feed in feed_list:
if not feed.get("Enabled", False):
continue
feed_region = feed.get("Region")
if feed_region and feed_region != self.region:
continue # Skip other CBC regions
active.append({
"Provider": provider,
"Name": feed.get("Name", "Unnamed Feed"),
"Url": feed.get("Url"),
"Category": feed.get("Category", "General"),
"Region": feed.get("Region", None)
})
self.active_feeds = active
print(f"[info] Loaded {len(self.active_feeds)} active feeds for region: {self.region}")
# ------------------------------------------------------------
def get_active_feeds(self) -> list[dict]:
"""Returns the list of active region-appropriate feeds."""
return self.active_feeds
def get_region(self) -> str:
"""Returns the currently set region."""
return self.region
# ------------------------------------------------------------
# Example usage (manual testing)
# ------------------------------------------------------------
if __name__ == "__main__":
    # Manual smoke test: load the default config and list what was selected.
    feed_loader = FeedLoader()
    selected = feed_loader.get_active_feeds()
    print(f"Detected Region: {feed_loader.get_region()}")
    for entry in selected:
        print(f"- {entry['Name']} ({entry['Url']}) [{entry['Provider']}]")

269
src/rss/rss_feedHandler.py Normal file
View File

@ -0,0 +1,269 @@
# ============================================================
# File: src/rss/rss_feedHandler.py
# Description:
# Handles RSS feed retrieval, caching, and validation for
# the Telefact broadcaster. Uses FeedLoader for region-aware
# feed selection and stores cached data under Cache/Feeds/<Region>.
# Automatically refreshes feeds only if the cache is older than
# the configured update interval (default: 10 minutes).
# Optimized for CBC RSS: ignores image-only items and non-story types.
# ============================================================
import os
import re
import json
import time
import requests
import xml.etree.ElementTree as ET
from html import unescape
from datetime import datetime
from src.config.feed_loader import FeedLoader
class RSSFeedHandler:
"""Handles downloading, caching, and parsing of RSS feeds."""
def __init__(
self,
cache_dir: str = "Cache/Feeds",
config_dir: str = "config",
cache_duration_minutes: int = 10,
story_limit: int = 6,
):
self.cache_dir = cache_dir
self.cache_duration = cache_duration_minutes * 60 # seconds
self.feed_loader = FeedLoader(config_dir=config_dir)
self.region = self.feed_loader.get_region()
self.story_limit = story_limit
self._ensure_cache_dirs()
# ------------------------------------------------------------
def _ensure_cache_dirs(self) -> None:
"""Ensures regional cache directories exist."""
region_path = os.path.join(self.cache_dir, self.region)
os.makedirs(region_path, exist_ok=True)
# ------------------------------------------------------------
def _get_cache_path(self, feed_name: str) -> str:
"""Generates path to feed cache file."""
safe_name = feed_name.replace(" ", "_").replace("/", "_")
return os.path.join(self.cache_dir, self.region, f"{safe_name}.json")
# ------------------------------------------------------------
def _is_cache_valid(self, path: str) -> bool:
"""Returns True if cache exists and is within valid time."""
if not os.path.exists(path):
return False
age = time.time() - os.path.getmtime(path)
return age < self.cache_duration
# ------------------------------------------------------------
def _fetch_rss(self, url: str) -> str | None:
"""Fetches raw RSS XML from a given URL with retry & spoofed UA."""
headers = {
"User-Agent": (
"Mozilla/5.0 (X11; Linux x86_64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/121.0 Safari/537.36"
)
}
for attempt in range(3):
try:
response = requests.get(url, headers=headers, timeout=20)
if response.status_code == 200:
return response.text
print(f"[warn] RSS fetch failed ({response.status_code}) for {url}")
break # non-200 is not retryable
except requests.Timeout:
print(f"[warn] Timeout {attempt + 1}/3 for {url}")
except requests.RequestException as e:
print(f"[warn] RSS fetch error ({attempt + 1}/3): {e}")
time.sleep(2 ** attempt) # backoff: 1 s, 2 s, 4 s
return None
# ------------------------------------------------------------
def _strip_html(self, text: str) -> str:
"""Removes HTML tags and decodes entities."""
# Remove <img> and <a> and all tags
clean = re.sub(r"<img[^>]*>", "", text)
clean = re.sub(r"<[^>]+>", "", clean)
clean = unescape(clean)
return clean.strip()
# ------------------------------------------------------------
def _parse_rss(self, xml_data: str) -> list[dict]:
"""Parses RSS XML into a list of story dictionaries, CBC-optimized."""
try:
root = ET.fromstring(xml_data)
except ET.ParseError as e:
print(f"[warn] XML parse error: {e}")
return []
channel = root.find("channel")
if channel is None:
return []
stories = []
for item in channel.findall("item"):
# --- CBC namespaced attributes ---
cbc_type = ""
deptid = ""
for child in item:
tag = child.tag.lower()
if tag.endswith("type"):
cbc_type = (child.text or "").strip()
elif tag.endswith("deptid"):
deptid = (child.text or "").strip()
# Skip if not a story
if cbc_type.lower() not in ("story", ""):
continue
title = item.findtext("title", "").strip()
link = item.findtext("link", "").strip()
description = item.findtext("description", "").strip()
pub_date = item.findtext("pubDate", "").strip()
# Remove CDATA wrappers
title = title.replace("<![CDATA[", "").replace("]]>", "").strip()
description = description.replace("<![CDATA[", "").replace("]]>", "").strip()
# Strip HTML
clean_text = self._strip_html(description)
# Skip empty or image-only items
if not clean_text or len(clean_text) < 20:
continue
# Truncate for Teletext readability
summary = clean_text[:500].rstrip()
stories.append({
"Title": title,
"Summary": summary,
"Link": link,
"PubDate": pub_date,
"DeptID": deptid,
"Provider": "CBC",
})
if not stories:
print("[debug] No valid <cbc:type>story</cbc:type> entries found — may be namespace issue.")
else:
print(f"[debug] Parsed {len(stories)} stories successfully.")
# Sort newest → oldest
stories = self._sort_stories(stories)
# Limit to N stories
if self.story_limit > 0:
stories = stories[:self.story_limit]
return stories
# ------------------------------------------------------------
def _sort_stories(self, stories: list[dict]) -> list[dict]:
"""Sorts stories by publication date, newest first."""
def parse_date(pubdate: str):
try:
return datetime.strptime(pubdate, "%a, %d %b %Y %H:%M:%S %Z")
except Exception:
return datetime.min
return sorted(stories, key=lambda s: parse_date(s.get("PubDate", "")), reverse=True)
# ------------------------------------------------------------
def _load_cached_feed(self, path: str) -> list[dict]:
"""Loads cached JSON feed data if available."""
try:
with open(path, "r", encoding="utf-8") as file:
return json.load(file)
except Exception as e:
print(f"[warn] Could not load cache: {e}")
return []
# ------------------------------------------------------------
def _save_cache(self, path: str, stories: list[dict]) -> None:
"""Saves parsed stories to local cache."""
try:
with open(path, "w", encoding="utf-8") as file:
json.dump(
{"LastUpdated": datetime.utcnow().isoformat(), "Stories": stories},
file,
ensure_ascii=False,
indent=2,
)
except Exception as e:
print(f"[warn] Failed to save cache for {path}: {e}")
# ------------------------------------------------------------
def update_feeds(self, force: bool = False) -> None:
"""Fetches and caches all active feeds."""
active_feeds = self.feed_loader.get_active_feeds()
if not active_feeds:
print("[warn] No active feeds to update.")
return
print(f"[info] Updating {len(active_feeds)} feeds for region: {self.region}")
for feed in active_feeds:
feed_name = feed["Name"]
feed_url = feed["Url"]
cache_path = self._get_cache_path(feed_name)
if not force and self._is_cache_valid(cache_path):
print(f"[info] Cache valid for: {feed_name}")
continue
print(f"[info] Fetching: {feed_name}")
xml_data = self._fetch_rss(feed_url)
if not xml_data:
continue
stories = self._parse_rss(xml_data)
if not stories:
print(f"[warn] No valid stories found in {feed_name}")
continue
self._save_cache(cache_path, stories)
print(f"[info] Cached {len(stories)} stories from {feed_name}")
# ------------------------------------------------------------
def load_cached_feeds(self) -> dict[str, list[dict]]:
"""Loads all cached feeds for the current region."""
region_path = os.path.join(self.cache_dir, self.region)
if not os.path.exists(region_path):
return {}
cached_data = {}
for filename in os.listdir(region_path):
if not filename.endswith(".json"):
continue
path = os.path.join(region_path, filename)
feed_name = filename.replace(".json", "").replace("_", " ")
data = self._load_cached_feed(path)
cached_data[feed_name] = data
return cached_data
# ------------------------------------------------------------
# Example usage (manual test)
# ------------------------------------------------------------
if __name__ == "__main__":
    # Manual smoke test: refresh stale feeds, then list every cached title.
    rss_handler = RSSFeedHandler(cache_duration_minutes=10, story_limit=6)
    rss_handler.update_feeds()
    for feed_title, payload in rss_handler.load_cached_feeds().items():
        print(f"\n=== {feed_title} ===")
        # A cache file holds a dict with "Stories"; a failed load yields a list.
        entries = payload.get("Stories", []) if isinstance(payload, dict) else payload
        for entry in entries:
            print(f"- {entry['Title']}")