v0.2.0: Stable RSS feed handler and region-aware feed loader for CBC feeds
This commit is contained in:
parent
211ebfcd01
commit
ddc5576ec1
132
config/feeds.json
Normal file
132
config/feeds.json
Normal file
|
|
@ -0,0 +1,132 @@
|
|||
{
|
||||
"Feeds": {
|
||||
"CBC": [
|
||||
{
|
||||
"Name": "CBC Top Stories",
|
||||
"Url": "https://www.cbc.ca/cmlink/rss-topstories",
|
||||
"Category": "National",
|
||||
"Enabled": true
|
||||
},
|
||||
{
|
||||
"Name": "CBC World",
|
||||
"Url": "https://www.cbc.ca/cmlink/rss-world",
|
||||
"Category": "International",
|
||||
"Enabled": true
|
||||
},
|
||||
{
|
||||
"Name": "CBC Politics",
|
||||
"Url": "https://www.cbc.ca/cmlink/rss-politics",
|
||||
"Category": "Politics",
|
||||
"Enabled": true
|
||||
},
|
||||
{
|
||||
"Name": "CBC Business",
|
||||
"Url": "https://www.cbc.ca/cmlink/rss-business",
|
||||
"Category": "Economy",
|
||||
"Enabled": true
|
||||
},
|
||||
{
|
||||
"Name": "CBC Health",
|
||||
"Url": "https://www.cbc.ca/cmlink/rss-health",
|
||||
"Category": "Health",
|
||||
"Enabled": true
|
||||
},
|
||||
{
|
||||
"Name": "CBC Technology & Science",
|
||||
"Url": "https://www.cbc.ca/cmlink/rss-technology",
|
||||
"Category": "Technology",
|
||||
"Enabled": true
|
||||
},
|
||||
{
|
||||
"Name": "CBC Arts & Entertainment",
|
||||
"Url": "https://www.cbc.ca/cmlink/rss-arts",
|
||||
"Category": "Arts",
|
||||
"Enabled": true
|
||||
},
|
||||
{
|
||||
"Name": "CBC Sports",
|
||||
"Url": "https://www.cbc.ca/cmlink/rss-sports",
|
||||
"Category": "Sports",
|
||||
"Enabled": true
|
||||
},
|
||||
{
|
||||
"Name": "CBC Local – Ottawa",
|
||||
"Url": "https://www.cbc.ca/cmlink/rss-canada-ottawa",
|
||||
"Category": "Local",
|
||||
"Region": "Ottawa",
|
||||
"Enabled": true
|
||||
},
|
||||
{
|
||||
"Name": "CBC Local – Toronto",
|
||||
"Url": "https://www.cbc.ca/cmlink/rss-canada-toronto",
|
||||
"Category": "Local",
|
||||
"Region": "Toronto",
|
||||
"Enabled": true
|
||||
},
|
||||
{
|
||||
"Name": "CBC Local – Montreal",
|
||||
"Url": "https://www.cbc.ca/cmlink/rss-canada-montreal",
|
||||
"Category": "Local",
|
||||
"Region": "Montreal",
|
||||
"Enabled": true
|
||||
},
|
||||
{
|
||||
"Name": "CBC Local – Vancouver",
|
||||
"Url": "https://www.cbc.ca/cmlink/rss-canada-britishcolumbia",
|
||||
"Category": "Local",
|
||||
"Region": "Vancouver",
|
||||
"Enabled": true
|
||||
},
|
||||
{
|
||||
"Name": "CBC Local – Calgary",
|
||||
"Url": "https://www.cbc.ca/cmlink/rss-canada-calgary",
|
||||
"Category": "Local",
|
||||
"Region": "Calgary",
|
||||
"Enabled": true
|
||||
},
|
||||
{
|
||||
"Name": "CBC Local – Edmonton",
|
||||
"Url": "https://www.cbc.ca/cmlink/rss-canada-edmonton",
|
||||
"Category": "Local",
|
||||
"Region": "Edmonton",
|
||||
"Enabled": true
|
||||
},
|
||||
{
|
||||
"Name": "CBC Local – Winnipeg",
|
||||
"Url": "https://www.cbc.ca/cmlink/rss-canada-manitoba",
|
||||
"Category": "Local",
|
||||
"Region": "Winnipeg",
|
||||
"Enabled": true
|
||||
},
|
||||
{
|
||||
"Name": "CBC Local – Halifax",
|
||||
"Url": "https://www.cbc.ca/cmlink/rss-canada-novascotia",
|
||||
"Category": "Local",
|
||||
"Region": "Halifax",
|
||||
"Enabled": true
|
||||
},
|
||||
{
|
||||
"Name": "CBC Local – St. John's",
|
||||
"Url": "https://www.cbc.ca/cmlink/rss-canada-newfoundland",
|
||||
"Category": "Local",
|
||||
"Region": "St. John's",
|
||||
"Enabled": true
|
||||
}
|
||||
],
|
||||
"WeatherNetwork": [
|
||||
{
|
||||
"Name": "WeatherNetwork National Weather",
|
||||
"Url": "",
|
||||
"Category": "Weather",
|
||||
"Enabled": false
|
||||
},
|
||||
{
|
||||
"Name": "WeatherNetwork Regional Feed",
|
||||
"Url": "",
|
||||
"Category": "Weather",
|
||||
"Enabled": false
|
||||
}
|
||||
],
|
||||
"Other": []
|
||||
}
|
||||
}
|
||||
134
src/config/feed_loader.py
Normal file
134
src/config/feed_loader.py
Normal file
|
|
@ -0,0 +1,134 @@
|
|||
# ============================================================
|
||||
# File: src/config/feed_loader.py
|
||||
# Description:
|
||||
# Loads and filters RSS feed definitions from /config/feeds.json.
|
||||
# Currently defaults to the Ottawa region for stable offline
|
||||
# development. Region aliases and geolocation support remain
|
||||
# in place for future activation.
|
||||
# ============================================================
|
||||
|
||||
import os
|
||||
import json
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Region alias table for common Canadian cities/suburbs
|
||||
# ------------------------------------------------------------
|
||||
# Lower-case place name -> CBC local region name, grouped by target region.
# Nothing in this module consults the table yet; it is kept for the future
# region-detection work mentioned in the file header.
REGION_ALIASES = {
    # Ontario
    **dict.fromkeys(
        ("cornwall", "south glengarry", "glengarry", "kingston", "belleville"),
        "Ottawa",
    ),
    # Québec (suburbs resolve to Montréal)
    **dict.fromkeys(
        (
            "montreal",
            "laval",
            "leval",  # typo guard
            "terrebonne",
            "longueuil",
            "brossard",
            "repentigny",
            "l'île-perrot",
            "saint-lambert",
        ),
        "Montreal",
    ),
    # British Columbia
    **dict.fromkeys(("vancouver", "burnaby", "surrey", "richmond"), "Vancouver"),
    # Alberta
    **dict.fromkeys(("calgary",), "Calgary"),
    **dict.fromkeys(("edmonton",), "Edmonton"),
    # Manitoba
    **dict.fromkeys(("winnipeg",), "Winnipeg"),
    # Nova Scotia
    **dict.fromkeys(("halifax",), "Halifax"),
    # Newfoundland and Labrador
    **dict.fromkeys(("st. john's", "saint johns"), "St. John's"),
}
|
||||
|
||||
|
||||
class FeedLoader:
|
||||
"""Handles loading and filtering RSS feed definitions."""
|
||||
|
||||
def __init__(self, config_dir: str = "config", region_override: str | None = None):
|
||||
self.config_path = os.path.join(config_dir, "feeds.json")
|
||||
|
||||
# For now, IP-based detection is disabled.
|
||||
# Use region_override if provided, otherwise default to Ottawa.
|
||||
self.region = region_override or "Ottawa"
|
||||
|
||||
self.all_feeds = {}
|
||||
self.active_feeds = []
|
||||
|
||||
self._load_json()
|
||||
self._filter_active_feeds()
|
||||
|
||||
# ------------------------------------------------------------
|
||||
def _load_json(self) -> None:
|
||||
"""Loads feeds.json and parses all sections."""
|
||||
if not os.path.exists(self.config_path):
|
||||
raise FileNotFoundError(f"Missing feeds.json at {self.config_path}")
|
||||
|
||||
with open(self.config_path, "r", encoding="utf-8") as file:
|
||||
try:
|
||||
data = json.load(file)
|
||||
except json.JSONDecodeError as e:
|
||||
raise ValueError(f"Invalid JSON in feeds.json: {e}")
|
||||
|
||||
if "Feeds" not in data or not isinstance(data["Feeds"], dict):
|
||||
raise ValueError("feeds.json must contain a 'Feeds' dictionary.")
|
||||
|
||||
self.all_feeds = data["Feeds"]
|
||||
|
||||
# ------------------------------------------------------------
|
||||
def _filter_active_feeds(self) -> None:
|
||||
"""Filters feeds for the detected region and enabled status."""
|
||||
active = []
|
||||
|
||||
for provider, feed_list in self.all_feeds.items():
|
||||
for feed in feed_list:
|
||||
if not feed.get("Enabled", False):
|
||||
continue
|
||||
|
||||
feed_region = feed.get("Region")
|
||||
if feed_region and feed_region != self.region:
|
||||
continue # Skip other CBC regions
|
||||
|
||||
active.append({
|
||||
"Provider": provider,
|
||||
"Name": feed.get("Name", "Unnamed Feed"),
|
||||
"Url": feed.get("Url"),
|
||||
"Category": feed.get("Category", "General"),
|
||||
"Region": feed.get("Region", None)
|
||||
})
|
||||
|
||||
self.active_feeds = active
|
||||
print(f"[info] Loaded {len(self.active_feeds)} active feeds for region: {self.region}")
|
||||
|
||||
# ------------------------------------------------------------
|
||||
def get_active_feeds(self) -> list[dict]:
|
||||
"""Returns the list of active region-appropriate feeds."""
|
||||
return self.active_feeds
|
||||
|
||||
def get_region(self) -> str:
|
||||
"""Returns the currently set region."""
|
||||
return self.region
|
||||
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Example usage (manual testing)
|
||||
# ------------------------------------------------------------
|
||||
if __name__ == "__main__":
    # Manual smoke test: load the default configuration and list what is active.
    feed_loader = FeedLoader()

    print(f"Detected Region: {feed_loader.get_region()}")
    for entry in feed_loader.get_active_feeds():
        print(f"- {entry['Name']} ({entry['Url']}) [{entry['Provider']}]")
|
||||
269
src/rss/rss_feedHandler.py
Normal file
269
src/rss/rss_feedHandler.py
Normal file
|
|
@ -0,0 +1,269 @@
|
|||
# ============================================================
|
||||
# File: src/rss/rss_feedHandler.py
|
||||
# Description:
|
||||
# Handles RSS feed retrieval, caching, and validation for
|
||||
# the Telefact broadcaster. Uses FeedLoader for region-aware
|
||||
# feed selection and stores cached data under Cache/Feeds/<Region>.
|
||||
# Automatically refreshes feeds only if the cache is older than
|
||||
# the configured update interval (default: 10 minutes).
|
||||
# Optimized for CBC RSS: ignores image-only items and non-story types.
|
||||
# ============================================================
|
||||
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import time
|
||||
import requests
|
||||
import xml.etree.ElementTree as ET
|
||||
from html import unescape
|
||||
from datetime import datetime
|
||||
from src.config.feed_loader import FeedLoader
|
||||
|
||||
|
||||
class RSSFeedHandler:
|
||||
"""Handles downloading, caching, and parsing of RSS feeds."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
cache_dir: str = "Cache/Feeds",
|
||||
config_dir: str = "config",
|
||||
cache_duration_minutes: int = 10,
|
||||
story_limit: int = 6,
|
||||
):
|
||||
self.cache_dir = cache_dir
|
||||
self.cache_duration = cache_duration_minutes * 60 # seconds
|
||||
self.feed_loader = FeedLoader(config_dir=config_dir)
|
||||
self.region = self.feed_loader.get_region()
|
||||
self.story_limit = story_limit
|
||||
|
||||
self._ensure_cache_dirs()
|
||||
|
||||
# ------------------------------------------------------------
|
||||
def _ensure_cache_dirs(self) -> None:
|
||||
"""Ensures regional cache directories exist."""
|
||||
region_path = os.path.join(self.cache_dir, self.region)
|
||||
os.makedirs(region_path, exist_ok=True)
|
||||
|
||||
# ------------------------------------------------------------
|
||||
def _get_cache_path(self, feed_name: str) -> str:
|
||||
"""Generates path to feed cache file."""
|
||||
safe_name = feed_name.replace(" ", "_").replace("/", "_")
|
||||
return os.path.join(self.cache_dir, self.region, f"{safe_name}.json")
|
||||
|
||||
# ------------------------------------------------------------
|
||||
def _is_cache_valid(self, path: str) -> bool:
|
||||
"""Returns True if cache exists and is within valid time."""
|
||||
if not os.path.exists(path):
|
||||
return False
|
||||
age = time.time() - os.path.getmtime(path)
|
||||
return age < self.cache_duration
|
||||
|
||||
# ------------------------------------------------------------
|
||||
def _fetch_rss(self, url: str) -> str | None:
|
||||
"""Fetches raw RSS XML from a given URL with retry & spoofed UA."""
|
||||
headers = {
|
||||
"User-Agent": (
|
||||
"Mozilla/5.0 (X11; Linux x86_64) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||
"Chrome/121.0 Safari/537.36"
|
||||
)
|
||||
}
|
||||
|
||||
for attempt in range(3):
|
||||
try:
|
||||
response = requests.get(url, headers=headers, timeout=20)
|
||||
if response.status_code == 200:
|
||||
return response.text
|
||||
print(f"[warn] RSS fetch failed ({response.status_code}) for {url}")
|
||||
break # non-200 is not retryable
|
||||
except requests.Timeout:
|
||||
print(f"[warn] Timeout {attempt + 1}/3 for {url}")
|
||||
except requests.RequestException as e:
|
||||
print(f"[warn] RSS fetch error ({attempt + 1}/3): {e}")
|
||||
time.sleep(2 ** attempt) # backoff: 1 s, 2 s, 4 s
|
||||
|
||||
return None
|
||||
|
||||
# ------------------------------------------------------------
|
||||
def _strip_html(self, text: str) -> str:
|
||||
"""Removes HTML tags and decodes entities."""
|
||||
# Remove <img> and <a> and all tags
|
||||
clean = re.sub(r"<img[^>]*>", "", text)
|
||||
clean = re.sub(r"<[^>]+>", "", clean)
|
||||
clean = unescape(clean)
|
||||
return clean.strip()
|
||||
|
||||
# ------------------------------------------------------------
|
||||
def _parse_rss(self, xml_data: str) -> list[dict]:
|
||||
"""Parses RSS XML into a list of story dictionaries, CBC-optimized."""
|
||||
try:
|
||||
root = ET.fromstring(xml_data)
|
||||
except ET.ParseError as e:
|
||||
print(f"[warn] XML parse error: {e}")
|
||||
return []
|
||||
|
||||
channel = root.find("channel")
|
||||
if channel is None:
|
||||
return []
|
||||
|
||||
stories = []
|
||||
for item in channel.findall("item"):
|
||||
# --- CBC namespaced attributes ---
|
||||
cbc_type = ""
|
||||
deptid = ""
|
||||
|
||||
for child in item:
|
||||
tag = child.tag.lower()
|
||||
if tag.endswith("type"):
|
||||
cbc_type = (child.text or "").strip()
|
||||
elif tag.endswith("deptid"):
|
||||
deptid = (child.text or "").strip()
|
||||
|
||||
# Skip if not a story
|
||||
if cbc_type.lower() not in ("story", ""):
|
||||
continue
|
||||
|
||||
title = item.findtext("title", "").strip()
|
||||
link = item.findtext("link", "").strip()
|
||||
description = item.findtext("description", "").strip()
|
||||
pub_date = item.findtext("pubDate", "").strip()
|
||||
|
||||
# Remove CDATA wrappers
|
||||
title = title.replace("<![CDATA[", "").replace("]]>", "").strip()
|
||||
description = description.replace("<![CDATA[", "").replace("]]>", "").strip()
|
||||
|
||||
# Strip HTML
|
||||
clean_text = self._strip_html(description)
|
||||
|
||||
# Skip empty or image-only items
|
||||
if not clean_text or len(clean_text) < 20:
|
||||
continue
|
||||
|
||||
# Truncate for Teletext readability
|
||||
summary = clean_text[:500].rstrip()
|
||||
|
||||
stories.append({
|
||||
"Title": title,
|
||||
"Summary": summary,
|
||||
"Link": link,
|
||||
"PubDate": pub_date,
|
||||
"DeptID": deptid,
|
||||
"Provider": "CBC",
|
||||
})
|
||||
|
||||
if not stories:
|
||||
print("[debug] No valid <cbc:type>story</cbc:type> entries found — may be namespace issue.")
|
||||
else:
|
||||
print(f"[debug] Parsed {len(stories)} stories successfully.")
|
||||
|
||||
# Sort newest → oldest
|
||||
stories = self._sort_stories(stories)
|
||||
|
||||
# Limit to N stories
|
||||
if self.story_limit > 0:
|
||||
stories = stories[:self.story_limit]
|
||||
|
||||
return stories
|
||||
|
||||
# ------------------------------------------------------------
|
||||
def _sort_stories(self, stories: list[dict]) -> list[dict]:
|
||||
"""Sorts stories by publication date, newest first."""
|
||||
def parse_date(pubdate: str):
|
||||
try:
|
||||
return datetime.strptime(pubdate, "%a, %d %b %Y %H:%M:%S %Z")
|
||||
except Exception:
|
||||
return datetime.min
|
||||
|
||||
return sorted(stories, key=lambda s: parse_date(s.get("PubDate", "")), reverse=True)
|
||||
|
||||
# ------------------------------------------------------------
|
||||
def _load_cached_feed(self, path: str) -> list[dict]:
|
||||
"""Loads cached JSON feed data if available."""
|
||||
try:
|
||||
with open(path, "r", encoding="utf-8") as file:
|
||||
return json.load(file)
|
||||
except Exception as e:
|
||||
print(f"[warn] Could not load cache: {e}")
|
||||
return []
|
||||
|
||||
# ------------------------------------------------------------
|
||||
def _save_cache(self, path: str, stories: list[dict]) -> None:
|
||||
"""Saves parsed stories to local cache."""
|
||||
try:
|
||||
with open(path, "w", encoding="utf-8") as file:
|
||||
json.dump(
|
||||
{"LastUpdated": datetime.utcnow().isoformat(), "Stories": stories},
|
||||
file,
|
||||
ensure_ascii=False,
|
||||
indent=2,
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"[warn] Failed to save cache for {path}: {e}")
|
||||
|
||||
# ------------------------------------------------------------
|
||||
def update_feeds(self, force: bool = False) -> None:
|
||||
"""Fetches and caches all active feeds."""
|
||||
active_feeds = self.feed_loader.get_active_feeds()
|
||||
if not active_feeds:
|
||||
print("[warn] No active feeds to update.")
|
||||
return
|
||||
|
||||
print(f"[info] Updating {len(active_feeds)} feeds for region: {self.region}")
|
||||
|
||||
for feed in active_feeds:
|
||||
feed_name = feed["Name"]
|
||||
feed_url = feed["Url"]
|
||||
cache_path = self._get_cache_path(feed_name)
|
||||
|
||||
if not force and self._is_cache_valid(cache_path):
|
||||
print(f"[info] Cache valid for: {feed_name}")
|
||||
continue
|
||||
|
||||
print(f"[info] Fetching: {feed_name}")
|
||||
xml_data = self._fetch_rss(feed_url)
|
||||
if not xml_data:
|
||||
continue
|
||||
|
||||
stories = self._parse_rss(xml_data)
|
||||
if not stories:
|
||||
print(f"[warn] No valid stories found in {feed_name}")
|
||||
continue
|
||||
|
||||
self._save_cache(cache_path, stories)
|
||||
print(f"[info] Cached {len(stories)} stories from {feed_name}")
|
||||
|
||||
# ------------------------------------------------------------
|
||||
def load_cached_feeds(self) -> dict[str, list[dict]]:
|
||||
"""Loads all cached feeds for the current region."""
|
||||
region_path = os.path.join(self.cache_dir, self.region)
|
||||
if not os.path.exists(region_path):
|
||||
return {}
|
||||
|
||||
cached_data = {}
|
||||
for filename in os.listdir(region_path):
|
||||
if not filename.endswith(".json"):
|
||||
continue
|
||||
path = os.path.join(region_path, filename)
|
||||
feed_name = filename.replace(".json", "").replace("_", " ")
|
||||
data = self._load_cached_feed(path)
|
||||
cached_data[feed_name] = data
|
||||
|
||||
return cached_data
|
||||
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Example usage (manual test)
|
||||
# ------------------------------------------------------------
|
||||
if __name__ == "__main__":
    # Manual smoke test: refresh stale feeds, then list every cached headline.
    rss_handler = RSSFeedHandler(cache_duration_minutes=10, story_limit=6)
    rss_handler.update_feeds()

    for feed_title, payload in rss_handler.load_cached_feeds().items():
        print(f"\n=== {feed_title} ===")
        # A cache entry is either a dict carrying a "Stories" list or a
        # plain list; handle both shapes.
        entries = payload.get("Stories", []) if isinstance(payload, dict) else payload
        for entry in entries:
            print(f"- {entry['Title']}")
|
||||
Loading…
Reference in New Issue
Block a user