# ----------------------------------------------------------------------
# 3️⃣ Core extractor
# ----------------------------------------------------------------------
def extract_video_info(page_url: str) -> dict:
    """Validate a video page URL as the first step of metadata extraction.

    The checks run in order and stop at the first failure:

    1. The scheme must be ``http`` or ``https``.
    2. The lower-cased network location must equal ``BASE_DOMAIN``.
    3. The path must contain a ``/watch/``, ``/video/`` or ``/v/`` segment
       (matched case-insensitively).

    NOTE(review): only the validation stage is visible in this section; the
    steps that fetch the page and build the returned dict are not shown
    here -- confirm against the full file.

    Args:
        page_url: Full URL of the video page.

    Returns:
        A dict of extracted metadata, per the annotation.

    Raises:
        RuntimeError: If any of the validation checks above fails.
    """
    # ----- Validate URL -----
    parsed = urllib.parse.urlparse(page_url)

    if parsed.scheme not in ("http", "https"):
        raise RuntimeError("URL must start with http:// or https://")

    # The comparison is against the whole netloc, so a URL carrying an
    # explicit port or userinfo is rejected as well.
    if parsed.netloc.lower() != BASE_DOMAIN:
        # Interpolate the configured domain so the message says which host
        # is expected rather than printing the literal name "BASE_DOMAIN".
        raise RuntimeError(f"URL must belong to {BASE_DOMAIN}")

    # Very simple heuristic that most videoone.com pages contain "/watch/"
    # or similar.
    if not re.search(r"/(watch|video|v)/", parsed.path, re.IGNORECASE):
        raise RuntimeError(
            "URL does not look like a video page (missing expected path segment)"
        )
# Deduplicate while preserving order
def dedup(seq):
    """Return a list of the items of *seq* with later duplicates dropped.

    The first occurrence of each item is kept and the relative order of the
    kept items is unchanged. Items must be hashable.
    """
    # dict keys are unique and remember insertion order, so building a dict
    # keyed on the items keeps exactly the first occurrence of each one.
    first_seen = dict.fromkeys(seq)
    return list(first_seen)