Spaces:
Sleeping
Sleeping
| """Gradio Space: YouTube topic -> captioned .docx tutorial (API-based acquisition). | |
| Pipeline: | |
| search top videos -> rank by YouTube Data API comment sentiment -> transcript via | |
| youtube-transcript-api -> DeepSeek-V3 tutorial -> real screenshots via yt-dlp stream URL | |
| + ffmpeg -ss (weighted timestamps) -> VLM captions -> .docx. | |
| No video download and no cookies/PO-token UI. Proxy access is optional: a per-user proxy | |
| URL can be pasted in the UI; otherwise the optional Space secrets YT_COOKIES / YT_PROXY | |
| act as a thin fallback (used only to resolve the stream URL and the transcript when the | |
| Space's datacenter IP is blocked). | |
| """ | |
| from __future__ import annotations | |
| import base64 | |
| import binascii | |
| import os | |
| import re | |
| import shutil | |
| import tempfile | |
| import time | |
def _install_proxy_ca() -> None:
    """Trust a self-signed proxy CA (secret ``YT_PROXY_CA``) alongside the system roots.

    This lets an HTTPS (TLS-wrapped) ``YT_PROXY`` validate. TLS-wrapping hides the
    target host (``CONNECT www.youtube.com``) from the Space's egress DPI, which
    otherwise resets YouTube-bound connections. Covers requests /
    youtube-transcript-api (``REQUESTS_CA_BUNDLE``) and stdlib ssl / yt-dlp
    (``SSL_CERT_FILE``).

    Does nothing when ``YT_PROXY_CA`` is unset or blank. The install is
    best-effort: a failure (certifi missing, unwritable temp dir, ...) is logged
    as a warning and the environment is left untouched, so a bad secret never
    prevents the app from starting.
    """
    ca = os.environ.get("YT_PROXY_CA", "").strip()
    if not ca:
        return
    try:
        import certifi

        bundle = os.path.join(tempfile.gettempdir(), "yt_proxy_ca_bundle.pem")
        with open(certifi.where(), encoding="utf-8") as fh:
            roots = fh.read()
        # Combined bundle = certifi's roots followed by the proxy CA.
        with open(bundle, "w", encoding="utf-8") as fh:
            fh.write(roots.rstrip() + "\n" + ca + "\n")
    except Exception as exc:
        # Deliberately broad: this runs at import time and must never crash the
        # app. Surface the reason instead of swallowing it silently.
        import logging

        logging.getLogger(__name__).warning(
            "YT_PROXY_CA is set but the CA bundle could not be installed "
            "(%s: %s); TLS validation of an HTTPS proxy will likely fail.",
            type(exc).__name__, exc,
        )
    else:
        os.environ["REQUESTS_CA_BUNDLE"] = bundle  # requests / youtube-transcript-api
        os.environ["SSL_CERT_FILE"] = bundle  # stdlib ssl + yt-dlp (see frames.py)


# Called before the gradio / pipeline imports that follow — presumably so the
# CA environment variables are in place before those libraries set up TLS.
_install_proxy_ca()
| import gradio as gr | |
| from pipeline import ( | |
| captions as captions_mod, | |
| docx_builder, | |
| frames as frames_mod, | |
| search as search_mod, | |
| sentiment as sentiment_mod, | |
| transcribe as transcribe_mod, | |
| tutorial as tutorial_mod, | |
| ) | |
# Models offered in the "Tutorial LLM" dropdown; the first entry is the default
# selection (the dropdown also accepts custom values).
LLM_CHOICES = [
    "deepseek-ai/DeepSeek-V3",
    "meta-llama/Llama-3.3-70B-Instruct",
    "openai/gpt-oss-120b",
]
# Models offered in the "Vision model (captions)" dropdown; the first entry is
# the default selection (the dropdown also accepts custom values).
VLM_CHOICES = [
    "Qwen/Qwen2.5-VL-72B-Instruct",
    "Qwen/Qwen2.5-VL-7B-Instruct",
    "meta-llama/Llama-3.2-90B-Vision-Instruct",
]
| # --------------------------------------------------------------- thin access fallback | |
def _looks_like_netscape(text: str) -> bool:
    """Heuristically decide whether *text* is a Netscape-format cookie file.

    True when the text, ignoring leading whitespace, starts with ``#``, or when
    a tab-delimited ``TRUE`` / ``FALSE`` field appears anywhere in it.
    """
    if text.lstrip().startswith("#"):
        return True
    return any(flag in text for flag in ("\tTRUE\t", "\tFALSE\t"))
def _maybe_b64_decode(text: str) -> str | None:
    """Return *text* base64-decoded if it decodes to a Netscape cookie file, else None.

    All whitespace is removed first. Input that is shorter than 16 characters,
    contains characters outside the standard base64 alphabet, fails strict
    decoding, or does not decode to something cookie-file-like yields None.
    """
    payload = "".join(text.split())
    if len(payload) < 16:
        return None
    if re.search(r"[^A-Za-z0-9+/=]", payload) is not None:
        return None
    try:
        raw = base64.b64decode(payload, validate=True)
    except (binascii.Error, ValueError):
        return None
    # Undecodable bytes are replaced rather than rejected.
    candidate = raw.decode("utf-8", "replace")
    if _looks_like_netscape(candidate):
        return candidate
    return None
def _cookiefile(workdir: str) -> str | None:
    """Write the optional ``YT_COOKIES`` secret to ``<workdir>/cookies.txt``.

    Returns the path of the written file, or None when the secret is unset or
    blank. A value that does not look like a Netscape cookie file is tried as
    base64 first. A Netscape header line is prepended when missing, and the
    content always ends with a newline.
    """
    raw = os.environ.get("YT_COOKIES")
    if not (raw and raw.strip()):
        return None

    if not _looks_like_netscape(raw):
        # The secret may have been stored base64-encoded; keep it as-is otherwise.
        raw = _maybe_b64_decode(raw) or raw

    known_headers = ("# Netscape", "# HTTP")
    if not raw.lstrip().startswith(known_headers):
        raw = "# Netscape HTTP Cookie File\n" + raw.lstrip("\n")
    if not raw.endswith("\n"):
        raw = raw + "\n"

    target = os.path.join(workdir, "cookies.txt")
    with open(target, "w", encoding="utf-8", newline="\n") as out:
        out.write(raw)
    return target
def _resolve_proxy(ui_proxy: str | None = None) -> str | None:
    """Pick the proxy URL to use: the UI value first, then the ``YT_PROXY`` secret.

    A per-user proxy pasted in the UI wins over the shared secret, so each user
    can route through their own residential proxy (see
    tools/home_proxy_panel.py). It also backs screenshot downloads
    (capture_shots falls back to it). Both values are stripped; returns None
    when neither is set.
    """
    pasted = (ui_proxy or "").strip()
    if pasted:
        return pasted
    shared = os.environ.get("YT_PROXY", "").strip()
    return shared if shared else None
def _resolve_media_proxy() -> str | None:
    """Return the ``YT_MEDIA_PROXY`` secret (stripped), or None when unset or blank.

    This is the plain HTTP proxy for large media (googlevideo.com) downloads.
    The TLS YT_PROXY can't sustain multi-MB transfers, so screenshots use this
    instead. This function does not fall back itself; when it returns None the
    screenshot code is expected to fall back to YT_PROXY.
    """
    value = os.environ.get("YT_MEDIA_PROXY", "").strip()
    if not value:
        return None
    return value
def _resolve_api_key(ui_key: str | None) -> str | None:
    """Pick the YouTube Data API key: the UI value first, then ``YOUTUBE_API_KEY``.

    Both values are stripped; returns None when neither is set.
    """
    typed = (ui_key or "").strip()
    if typed:
        return typed
    from_env = os.environ.get("YOUTUBE_API_KEY", "").strip()
    return from_env if from_env else None
def check_access(yt_proxy=None):
    """Health check: which secrets are set, the egress IP, and YouTube reachability.

    Args:
        yt_proxy: Optional proxy URL pasted in the UI; it takes precedence over
            the ``YT_PROXY`` secret (see ``_resolve_proxy``).

    Returns:
        A Markdown string: a verdict heading followed by one bullet per secret,
        the egress IP, and each reachability probe.
    """
    import requests

    proxy = _resolve_proxy(yt_proxy)
    proxies = {"http": proxy, "https": proxy} if proxy else None
    src = "your proxy field" if (yt_proxy or "").strip() else "YT_PROXY secret"

    def secret_set(name):
        # Whitespace-only values count as unset, matching how the resolvers
        # (_cookiefile, _resolve_api_key) treat them.
        return bool(os.environ.get(name, "").strip())

    lines = [
        f"- **Proxy**: {('set ✅ (' + src + ')') if proxy else 'not set ⚪'}",
        f"- **Media proxy** (`YT_MEDIA_PROXY`, screenshots): "
        f"{'set ✅' if _resolve_media_proxy() else 'not set ⚪ (falls back to YT_PROXY)'}",
        f"- **Cookies** (`YT_COOKIES`): {'set ✅' if secret_set('YT_COOKIES') else 'not set ⚪'}",
        f"- **Data API key** (`YOUTUBE_API_KEY`): {'set ✅' if secret_set('YOUTUBE_API_KEY') else 'not set ⚪'}",
    ]
    try:
        ip = requests.get("https://api.ipify.org", proxies=proxies, timeout=20).text.strip()
        lines.append(f"- **Egress IP** {'(via proxy)' if proxy else '(Space direct)'}: `{ip}`")
    except Exception as exc:
        # Only the exception type is shown; the message is not echoed to the UI.
        lines.append(f"- **Egress IP**: ❌ {type(exc).__name__}")

    def probe(url):
        """GET *url* (through the proxy, if any); return ``(ok, short_message)``."""
        attempts = 2  # tolerate a single transient reset
        last = None
        for attempt in range(attempts):
            try:
                r = requests.get(url, proxies=proxies, timeout=20)
                # Any HTTP response counts as reachable, whatever the status.
                return True, f"HTTP {r.status_code}"
            except Exception as exc:
                last = exc
                if attempt + 1 < attempts:
                    # Pause only between attempts, never after the final failure.
                    time.sleep(1.0)
        return False, f"{type(last).__name__}: {str(last)[:90]}"

    # Discriminating battery: neutral-small vs Google-family (same big cert as
    # YouTube, different hostname) vs YouTube itself. The pass/fail pattern says
    # whether it's a size/MTU blackhole, hostname-based egress filtering, or reset.
    neutral_label = "example.com (neutral)"
    google_label = "google.com/204 (Google cert, non-YT name)"
    youtube_label = "youtube.com/robots.txt"
    probes = [
        (neutral_label, "https://example.com"),
        (google_label, "https://www.google.com/generate_204"),
        ("i.ytimg.com (YT image CDN)", "https://i.ytimg.com/generate_204"),
        ("youtubei.googleapis.com (YT API)", "https://youtubei.googleapis.com/generate_204"),
        (youtube_label, "https://www.youtube.com/robots.txt"),
        ("youtube.com/ (large body)", "https://www.youtube.com/"),
    ]
    results = {}
    for label, url in probes:
        ok, msg = probe(url)
        results[label] = ok
        lines.append(f"- {'✅' if ok else '❌'} {label}: {msg}")

    if results.get(youtube_label):
        verdict = "### 🟢 YouTube is reachable — transcript + screenshots should work."
    elif results.get(google_label):
        verdict = ("### 🔴 YouTube is filtered by hostname.\n"
                   "Google works but YouTube is reset → the network between the Space and the "
                   "tunnel is dropping the plaintext `CONNECT www.youtube.com`. Fix: run the "
                   "home proxy as an **HTTPS proxy** (TLS-wrapped) so the target host is hidden.")
    elif results.get(neutral_label):
        # Google is known to have failed here (previous branch), so only the
        # small neutral site got through.
        verdict = ("### 🔴 Large TLS handshakes are being dropped (MTU blackhole).\n"
                   "Small sites work; Google/YouTube (large cert chains) reset. Fix: clamp MSS on "
                   "the tunnel path or switch tunnel provider (e.g. ngrok).")
    else:
        verdict = ("### 🔴 YouTube is NOT reachable from the Space.\n"
                   "The proxy connects (egress IP works) but TLS to YouTube is failing. "
                   "Try again in a minute or refresh **`YT_PROXY`** via the panel in "
                   "[`tools/`](tools/README.md).")
    return verdict + "\n\n" + "\n".join(lines)
| # ----------------------------------------------------------------------------- helpers | |
def _ranking_rows(scored: list[dict]) -> list[list]:
    """Turn scored video dicts into rows for the sentiment-ranking table.

    Each row is ``[rank, title, positive %, comment count, note, url]``, with
    ranks starting at 1 in input order. The title falls back to the video id
    and an empty note is shown as ``"ok"``. Every dict must carry
    ``"video_id"`` and ``"url"``.
    """
    return [
        [
            position,
            video.get("title", video["video_id"]),
            f"{video.get('positive_share', 0) * 100:.0f}%",
            video.get("n_comments", 0),
            video.get("note", "") or "ok",
            video["url"],
        ]
        for position, video in enumerate(scored, start=1)
    ]
def _safe_name(text: str) -> str:
    """Reduce *text* to a filename-safe stem of at most 60 characters.

    Runs of characters other than letters, digits, ``.``, ``_`` and ``-``
    collapse to one underscore, and leading/trailing underscores are stripped
    before truncation. An empty result becomes ``"tutorial"``.
    """
    collapsed = re.sub(r"[^A-Za-z0-9._-]+", "_", text)
    stem = collapsed.strip("_")[:60]
    return stem if stem else "tutorial"
def _collect_keywords(primary_kw, secondary_kw) -> dict:
    """Normalise the keyword inputs into ``{"primary": str, "secondary": list[str]}``.

    The primary keyword is stripped (None becomes ``""``). Secondary keywords
    are split on commas and stripped; blanks, case-insensitive duplicates, and
    anything equal to the primary keyword are dropped, keeping first-seen order.
    """
    main_kw = (primary_kw or "").strip()
    used = {main_kw.lower()}
    extras = []
    candidates = (chunk.strip() for chunk in (secondary_kw or "").split(","))
    for candidate in candidates:
        folded = candidate.lower()
        if not candidate or folded in used:
            continue
        used.add(folded)
        extras.append(candidate)
    return {"primary": main_kw, "secondary": extras}
def run_pipeline(topic, hf_token, yt_api_key, yt_proxy, llm_model, vlm_model,
                 w_llm, w_whisper, lead, max_shots, primary_kw, secondary_kw,
                 content_brief="", progress=gr.Progress()):
    """Generator yielding (status_md, ranking_df, transcript, docx_file).

    Runs the pipeline end to end: search -> sentiment ranking -> transcript ->
    tutorial text -> screenshots -> captions -> .docx. A progress update is
    yielded after each stage so the UI refreshes incrementally.

    Raises:
        gr.Error: for a missing topic or token, an empty search result, and for
            TranscriptError / SentimentError / RuntimeError / ValueError raised
            by the pipeline modules (re-raised with the original as the cause).

    The temporary working directory is removed when the run fails or is
    abandoned. After a successful run only the generated .docx is kept, since
    it is the file handed to the UI for download.
    """
    log: list[str] = []
    hidden: list[str] = []  # proxy URLs that must never reach the UI

    def status(msg: str):
        log.append(msg)
        return "\n\n".join(log)

    def scrub(text: str) -> str:
        # Proxy URLs carry user:pass credentials; mask them in surfaced errors.
        for secret in hidden:
            text = text.replace(secret, "<proxy>")
        return text

    topic = (topic or "").strip()
    if not topic:
        raise gr.Error("Please enter a topic.")
    if not (hf_token or "").strip():
        raise gr.Error("Please paste your Hugging Face token (used for the LLM + vision model).")

    workdir = tempfile.mkdtemp(prefix="ytt_")
    frames_dir = os.path.join(workdir, "frames")
    cookiefile = None
    built = False  # becomes True once the .docx has been written
    try:
        api_key = _resolve_api_key(yt_api_key)
        cookiefile = _cookiefile(workdir)
        proxy = _resolve_proxy(yt_proxy)
        media_proxy = _resolve_media_proxy()
        hidden.extend(p for p in (proxy, media_proxy) if p)
        if proxy:
            # Never log the proxy URL itself — it carries user:pass credentials.
            src = "your proxy" if (yt_proxy or "").strip() else "the `YT_PROXY` secret"
            log.append(f"🔐 Routing transcript + screenshot requests through {src}.")

        # 1. Search ------------------------------------------------------------------
        progress(0.03, desc="Searching")
        yield status(f"🔍 Searching top videos for **{topic}**…"), gr.update(), gr.update(), gr.update()
        videos = search_mod.search_top5(topic)
        if not videos:
            # Without this guard `videos[0]` below raises an uncaught IndexError.
            raise gr.Error(f"No videos found for '{topic}'. Try a different topic.")
        yield status(f"Found {len(videos)} candidate videos."), gr.update(), gr.update(), gr.update()

        # 2. Sentiment ranking (YouTube Data API comments) ---------------------------
        if api_key:
            yield (status("💬 Fetching comments (YouTube Data API) and scoring sentiment…"),
                   gr.update(), gr.update(), gr.update())
            best, scored = sentiment_mod.rank_by_sentiment(videos, api_key, progress)
            picked_msg = f"({best['positive_share'] * 100:.0f}% positive comments)"
        else:
            best = videos[0]
            scored = [{**v, "positive_share": 0.0, "n_comments": 0,
                       "note": "sentiment skipped (no API key)", "search_rank": i}
                      for i, v in enumerate(videos)]
            picked_msg = "(no YouTube Data API key → used top search result)"
        ranking = gr.update(value=_ranking_rows(scored))
        yield (status(f"🏆 Picked **{best.get('title', best['video_id'])}** {picked_msg}."),
               ranking, gr.update(), gr.update())

        # 3. Transcript (youtube-transcript-api) -------------------------------------
        progress(0.3, desc="Transcript")
        yield status("📝 Fetching the timestamped transcript…"), ranking, gr.update(), gr.update()
        segs = transcribe_mod.get_segments(best["video_id"], proxy=proxy)
        transcript = transcribe_mod.transcript_text(segs)
        yield (status(f"Transcript ready ({len(segs)} segments)."),
               ranking, gr.update(value=transcript), gr.update())

        # 4. Tutorial text -----------------------------------------------------------
        progress(0.6, desc="Writing tutorial")
        keywords = _collect_keywords(primary_kw, secondary_kw)
        kw_note = f" • primary: '{keywords['primary']}'" if keywords["primary"] else ""
        if keywords["secondary"]:
            kw_note += f" • secondary: {', '.join(keywords['secondary'])}"
        if (content_brief or "").strip():
            kw_note += " • honoring your content brief"
        yield (status(f"🤖 Generating tutorial with `{llm_model}`{kw_note}…"),
               ranking, gr.update(value=transcript), gr.update())
        tut = tutorial_mod.generate_tutorial(transcript, hf_token.strip(), llm_model,
                                             keywords, brief=content_brief)
        if keywords["primary"]:
            n = tutorial_mod.count_keyword(tut, keywords["primary"])
            yield (status(f"🔑 Primary keyword '{keywords['primary']}' appears {n}× in the post."),
                   ranking, gr.update(value=transcript), gr.update())

        # 5. Screenshots (weighted timestamps -> yt-dlp clip download -> ffmpeg) ------
        selected, caps = {}, {}
        times = frames_mod.compute_shot_times(
            tut["steps"], segs, w_llm=float(w_llm), w_whisper=float(w_whisper),
            lead=float(lead), max_shots=int(max_shots))
        if times:
            progress(0.75, desc="Screenshots")
            yield (status(f"🎞️ Capturing {len(times)} screenshots at weighted timestamps…"),
                   ranking, gr.update(value=transcript), gr.update())
            try:
                selected = frames_mod.capture_shots(
                    times, best["video_id"], frames_dir, cookiefile, proxy,
                    media_proxy, progress)
            except Exception as exc:
                # Screenshots are optional: report the failure and continue
                # text-only. Scrub before truncating so a credential can't
                # survive as a partial string.
                detail = scrub(str(exc))[:400]
                yield (status(f"⚠️ Couldn't fetch screenshots — text-only tutorial. "
                              f"`{type(exc).__name__}: {detail}`"),
                       ranking, gr.update(value=transcript), gr.update())
        if selected:
            progress(0.88, desc="Captioning")
            yield (status(f"✍️ Captioning {len(selected)} screenshots with `{vlm_model}`…"),
                   ranking, gr.update(value=transcript), gr.update())
            caps = captions_mod.caption_frames(selected, tut["steps"], hf_token.strip(),
                                               vlm_model, progress)

        # 6. DOCX --------------------------------------------------------------------
        progress(0.96, desc="Building document")
        out_path = os.path.join(workdir, f"{_safe_name(tut['title'])}.docx")
        docx_builder.build_docx(tut, selected, caps, out_path, source_url=best["url"])
        built = True
        progress(1.0, desc="Done")
        shots_msg = f"{len(selected)} screenshots" if selected else "text-only"
        yield (status(f"✅ Done ({shots_msg}). Download your tutorial below."),
               ranking, gr.update(value=transcript), gr.update(value=out_path))
    except gr.Error:
        raise
    except (transcribe_mod.TranscriptError, sentiment_mod.SentimentError,
            RuntimeError, ValueError) as exc:
        raise gr.Error(scrub(str(exc))) from exc
    finally:
        if built:
            # Keep only the .docx. The cookie file holds secret material, and
            # the frames are not needed once the document is written
            # (NOTE(review): assumes build_docx embeds the images — confirm).
            shutil.rmtree(frames_dir, ignore_errors=True)
            if cookiefile:
                try:
                    os.remove(cookiefile)
                except OSError:
                    pass  # already gone; nothing left to protect
        else:
            # Failed or abandoned run: nothing in the workdir is needed.
            shutil.rmtree(workdir, ignore_errors=True)
def build_ui():
    """Build the Gradio Blocks app and wire its two buttons; return the Blocks object.

    The health button calls ``check_access`` with the proxy field, and the
    generate button streams ``run_pipeline`` into the status, ranking,
    transcript and file outputs.

    NOTE(review): the original indentation was lost in extraction, so the
    nesting of the ``with`` blocks below is a best-effort reconstruction.
    Confirm component placement against the real file.
    """
    with gr.Blocks(title="YouTube → Tutorial Post") as demo:
        gr.Markdown(
            "# 📝 YouTube → Tutorial Post Generator\n"
            "Enter a topic, your **Hugging Face token** (LLM + vision model, billed to you) "
            "and a **YouTube Data API key** (for comments). The Space picks the best video by "
            "comment sentiment, pulls its transcript, writes an AEO-friendly tutorial, grabs "
            "real screenshots at the right moments, and builds a **.docx**.\n\n"
            "> 🏠 **YouTube blocks this Space's datacenter IP.** Run your own residential proxy "
            "with the **Home Proxy Panel** "
            "([download / source](https://github.com/vivekchakraverty/tutorialmaker-home-proxy-panel)) "
            "and paste its URL into **Your proxy URL** below — transcript + screenshots then route "
            "through your home IP. Each user brings their own proxy."
        )
        with gr.Row():
            # Left column: topic and credentials (password-type fields for secrets).
            with gr.Column(scale=2):
                topic = gr.Textbox(label="Topic", placeholder="e.g. Excel pivot tables for beginners")
                hf_token = gr.Textbox(label="Hugging Face token", type="password",
                                      placeholder="hf_… (Inference Providers permission)")
                yt_api_key = gr.Textbox(label="YouTube Data API key", type="password",
                                        placeholder="for comments (or set the YOUTUBE_API_KEY secret)")
                yt_proxy = gr.Textbox(
                    label="Your proxy URL (recommended)", type="password",
                    placeholder="http://user:pass@bore.pub:12345 — from the Home Proxy Panel",
                    info="Run the Home Proxy Panel (github.com/vivekchakraverty/"
                         "tutorialmaker-home-proxy-panel) on your own machine and paste its "
                         "proxy URL here so YouTube requests exit from your residential IP. "
                         "Leave blank to use the Space's shared proxy, if configured.")
            # Right column: model pickers; custom model ids are allowed.
            with gr.Column(scale=1):
                llm_model = gr.Dropdown(LLM_CHOICES, value=LLM_CHOICES[0],
                                        label="Tutorial LLM", allow_custom_value=True)
                vlm_model = gr.Dropdown(VLM_CHOICES, value=VLM_CHOICES[0],
                                        label="Vision model (captions)", allow_custom_value=True)
        # NOTE(review): placed below the row here; it may originally have lived
        # inside one of the columns.
        content_brief = gr.Textbox(
            label="What the content must cover (optional)",
            lines=4,
            placeholder=("List the points, steps, or questions the tutorial must address — "
                         "one per line or comma-separated.\n"
                         "e.g. How to create a pivot table\nHow to refresh data\n"
                         "Common pivot table errors"),
            info=("Required coverage for the tutorial writer. Leave blank to just follow "
                  "the video. Screenshots stay optional — if none are suitable the post is "
                  "produced without them."),
        )
        with gr.Accordion("SEO / AEO keywords (optional)", open=False):
            gr.Markdown(
                "The **primary keyword** is used naturally ~3× in the body and placed in "
                "the title, URL slug, meta description, the first 100 words, and one or two "
                "H2 headings. Each **secondary keyword** is used once. The post also follows "
                "answer-engine best practices (direct answer up top, FAQ, last-updated date, "
                "source citation)."
            )
            primary_kw = gr.Textbox(label="Primary keyword", placeholder="e.g. excel pivot tables")
            secondary_kw = gr.Textbox(label="Secondary keywords (comma-separated)",
                                      placeholder="e.g. pivot chart, data summary")
        with gr.Accordion("Advanced settings", open=False):
            # Screenshot timing controls passed straight through to run_pipeline.
            # NOTE(review): all four sliders are assumed to share this one row.
            with gr.Row():
                w_llm = gr.Slider(0.0, 1.0, value=0.4, step=0.05, label="Weight: LLM timestamp")
                w_whisper = gr.Slider(0.0, 1.0, value=0.6, step=0.05, label="Weight: transcript timing")
                lead = gr.Slider(0.0, 5.0, value=1.0, step=0.5, label="Lead offset (s)")
                max_shots = gr.Slider(1, 15, value=8, step=1, label="Max screenshots")
        # NOTE(review): health_md is assumed to sit outside this row.
        with gr.Row():
            health_btn = gr.Button("🩺 Check YouTube access (proxy + reachability)")
        health_md = gr.Markdown()
        run_btn = gr.Button("Generate tutorial", variant="primary")
        status_md = gr.Markdown(label="Status")
        ranking_df = gr.Dataframe(
            headers=["#", "Title", "Positive", "Comments", "Note", "URL"],
            label="Sentiment ranking", interactive=False, wrap=True,
        )
        transcript_box = gr.Textbox(label="Transcript preview", lines=10, max_lines=20)
        docx_file = gr.File(label="Download tutorial (.docx)")
        health_btn.click(check_access, inputs=[yt_proxy], outputs=health_md)
        # Input order must match run_pipeline's positional parameters; the
        # outputs match the 4-tuples it yields.
        run_btn.click(
            run_pipeline,
            inputs=[topic, hf_token, yt_api_key, yt_proxy, llm_model, vlm_model,
                    w_llm, w_whisper, lead, max_shots, primary_kw, secondary_kw,
                    content_brief],
            outputs=[status_md, ranking_df, transcript_box, docx_file],
        )
    return demo
# Expose a module-level `demo` so HF Spaces' SSR launcher finds it (avoids the
# "Launching demo not found in __main__" fallback warning).
demo = build_ui()
# Turn on Gradio's request queue for the app before it is served.
demo.queue()

if __name__ == "__main__":
    # Running this file directly starts the Gradio server.
    demo.launch()