TutorialMaker / app.py
vivekchakraverty's picture
Per-user residential proxy: proxy URL field + downloadable Home Proxy Panel
12e4183
Raw
History Blame Contribute Delete
20.9 kB
"""Gradio Space: YouTube topic -> captioned .docx tutorial (API-based acquisition).
Pipeline:
search top videos -> rank by YouTube Data API comment sentiment -> transcript via
youtube-transcript-api -> DeepSeek-V3 tutorial -> real screenshots via yt-dlp stream URL
+ ffmpeg -ss (weighted timestamps) -> VLM captions -> .docx.
No video download and no cookies/PO-token UI. A per-user proxy URL may be pasted into the
UI and takes precedence; otherwise a thin fallback remains via the optional Space secrets
YT_COOKIES / YT_PROXY (used only to resolve the stream URL and the transcript when the
Space's datacenter IP is blocked).
"""
from __future__ import annotations
import base64
import binascii
import os
import re
import shutil
import tempfile
import time
def _install_proxy_ca() -> None:
"""Trust a self-signed proxy CA (secret ``YT_PROXY_CA``) alongside the system
roots, so an HTTPS (TLS-wrapped) ``YT_PROXY`` validates. TLS-wrapping hides the
target host (``CONNECT www.youtube.com``) from the Space's egress DPI, which
otherwise resets YouTube-bound connections. Covers requests / youtube-transcript-api
(``REQUESTS_CA_BUNDLE``) and stdlib ssl / yt-dlp (``SSL_CERT_FILE``)."""
ca = os.environ.get("YT_PROXY_CA", "").strip()
if not ca:
return
try:
import certifi
bundle = os.path.join(tempfile.gettempdir(), "yt_proxy_ca_bundle.pem")
with open(certifi.where(), encoding="utf-8") as fh:
roots = fh.read()
with open(bundle, "w", encoding="utf-8") as fh:
fh.write(roots.rstrip() + "\n" + ca + "\n")
os.environ["REQUESTS_CA_BUNDLE"] = bundle # requests / youtube-transcript-api
os.environ["SSL_CERT_FILE"] = bundle # stdlib ssl + yt-dlp (see frames.py)
except Exception:
pass
_install_proxy_ca()
import gradio as gr
from pipeline import (
captions as captions_mod,
docx_builder,
frames as frames_mod,
search as search_mod,
sentiment as sentiment_mod,
transcribe as transcribe_mod,
tutorial as tutorial_mod,
)
# Model ids offered in the "Tutorial LLM" dropdown; the first entry is the default
# selection (build_ui passes LLM_CHOICES[0] as the dropdown value).
LLM_CHOICES = [
    "deepseek-ai/DeepSeek-V3",
    "meta-llama/Llama-3.3-70B-Instruct",
    "openai/gpt-oss-120b",
]
# Model ids offered in the "Vision model (captions)" dropdown; the first entry is
# the default selection.
VLM_CHOICES = [
    "Qwen/Qwen2.5-VL-72B-Instruct",
    "Qwen/Qwen2.5-VL-7B-Instruct",
    "meta-llama/Llama-3.2-90B-Vision-Instruct",
]
# --------------------------------------------------------------- thin access fallback
def _looks_like_netscape(text: str) -> bool:
head = text.lstrip()
return head.startswith("#") or "\tTRUE\t" in text or "\tFALSE\t" in text
def _maybe_b64_decode(text: str) -> str | None:
compact = "".join(text.split())
if len(compact) < 16 or re.search(r"[^A-Za-z0-9+/=]", compact):
return None
try:
decoded = base64.b64decode(compact, validate=True).decode("utf-8", "replace")
except (binascii.Error, ValueError):
return None
return decoded if _looks_like_netscape(decoded) else None
def _cookiefile(workdir: str) -> str | None:
"""Materialize the optional YT_COOKIES secret to a Netscape file; return path or None."""
data = os.environ.get("YT_COOKIES")
if not data or not data.strip():
return None
if not _looks_like_netscape(data):
decoded = _maybe_b64_decode(data)
if decoded:
data = decoded
if not data.lstrip().startswith(("# Netscape", "# HTTP")):
data = "# Netscape HTTP Cookie File\n" + data.lstrip("\n")
if not data.endswith("\n"):
data += "\n"
path = os.path.join(workdir, "cookies.txt")
with open(path, "w", encoding="utf-8", newline="\n") as fh:
fh.write(data)
return path
def _resolve_proxy(ui_proxy: str | None = None) -> str | None:
"""A per-user proxy pasted in the UI wins over the shared YT_PROXY secret, so each
user can route through their own residential proxy (see tools/home_proxy_panel.py).
It also backs screenshot downloads (capture_shots falls back to it)."""
proxy = (ui_proxy or "").strip() or os.environ.get("YT_PROXY", "").strip()
return proxy or None
def _resolve_media_proxy() -> str | None:
"""Plain HTTP proxy for large media (googlevideo.com) downloads. The TLS YT_PROXY
can't sustain multi-MB transfers, so screenshots use this instead; falls back to
YT_PROXY when unset."""
mp = os.environ.get("YT_MEDIA_PROXY", "").strip()
return mp or None
def _resolve_api_key(ui_key: str | None) -> str | None:
key = (ui_key or "").strip() or os.environ.get("YOUTUBE_API_KEY", "").strip()
return key or None
def check_access(yt_proxy=None):
    """Health check: which secrets are set, the egress IP, and YouTube reachability.

    Args:
        yt_proxy: Optional proxy URL pasted in the UI. It takes precedence over
            the ``YT_PROXY`` secret (resolved through ``_resolve_proxy``).

    Returns:
        Markdown text: a verdict heading followed by one bullet per check.
        Network failures are reported inside the text, never raised.
    """
    import requests

    proxy = _resolve_proxy(yt_proxy)
    proxies = {"http": proxy, "https": proxy} if proxy else None
    src = "your proxy field" if (yt_proxy or "").strip() else "YT_PROXY secret"

    def secret_is_set(name):
        # Whitespace-only values count as unset, matching the _resolve_* helpers.
        return bool(os.environ.get(name, "").strip())

    def scrub(text):
        # The proxy URL carries user:pass credentials; never echo it back.
        return text.replace(proxy, "<proxy>") if proxy else text

    lines = [
        f"- **Proxy**: {('set ✅ (' + src + ')') if proxy else 'not set ⚪'}",
        f"- **Media proxy** (`YT_MEDIA_PROXY`, screenshots): "
        f"{'set ✅' if _resolve_media_proxy() else 'not set ⚪ (falls back to YT_PROXY)'}",
        f"- **Cookies** (`YT_COOKIES`): {'set ✅' if secret_is_set('YT_COOKIES') else 'not set ⚪'}",
        f"- **Data API key** (`YOUTUBE_API_KEY`): {'set ✅' if secret_is_set('YOUTUBE_API_KEY') else 'not set ⚪'}",
    ]
    try:
        ip = requests.get("https://api.ipify.org", proxies=proxies, timeout=20).text.strip()
        lines.append(f"- **Egress IP** {'(via proxy)' if proxy else '(Space direct)'}: `{ip}`")
    except Exception as exc:
        lines.append(f"- **Egress IP**: ❌ {type(exc).__name__}")

    def probe(url, attempts=2):
        """GET *url*; any HTTP response counts as reachable. Returns (ok, message)."""
        last = None
        for attempt in range(attempts):  # tolerate a single transient reset
            if attempt:
                # Pause only between attempts, not after the final failure.
                time.sleep(1.0)
            try:
                r = requests.get(url, proxies=proxies, timeout=20)
                return True, f"HTTP {r.status_code}"
            except Exception as exc:
                last = exc
        return False, f"{type(last).__name__}: {scrub(str(last))[:90]}"

    # Discriminating battery: neutral-small vs Google-family (same big cert as
    # YouTube, different hostname) vs YouTube itself. The pass/fail pattern says
    # whether it's a size/MTU blackhole, hostname-based egress filtering, or reset.
    neutral = "example.com (neutral)"
    google = "google.com/204 (Google cert, non-YT name)"
    yt_robots = "youtube.com/robots.txt"
    probes = [
        (neutral, "https://example.com"),
        (google, "https://www.google.com/generate_204"),
        ("i.ytimg.com (YT image CDN)", "https://i.ytimg.com/generate_204"),
        ("youtubei.googleapis.com (YT API)", "https://youtubei.googleapis.com/generate_204"),
        (yt_robots, "https://www.youtube.com/robots.txt"),
        ("youtube.com/ (large body)", "https://www.youtube.com/"),
    ]
    results = {}
    for label, url in probes:
        ok, msg = probe(url)
        results[label] = ok
        lines.append(f"- {'✅' if ok else '❌'} {label}: {msg}")

    if results.get(yt_robots):
        verdict = "### 🟢 YouTube is reachable — transcript + screenshots should work."
    elif results.get(google):
        verdict = ("### 🔴 YouTube is filtered by hostname.\n"
                   "Google works but YouTube is reset → the network between the Space and the "
                   "tunnel is dropping the plaintext `CONNECT www.youtube.com`. Fix: run the "
                   "home proxy as an **HTTPS proxy** (TLS-wrapped) so the target host is hidden.")
    elif results.get(neutral):
        # Google is already known to have failed here (previous branch).
        verdict = ("### 🔴 Large TLS handshakes are being dropped (MTU blackhole).\n"
                   "Small sites work; Google/YouTube (large cert chains) reset. Fix: clamp MSS on "
                   "the tunnel path or switch tunnel provider (e.g. ngrok).")
    else:
        verdict = ("### 🔴 YouTube is NOT reachable from the Space.\n"
                   "The proxy connects (egress IP works) but TLS to YouTube is failing. "
                   "Try again in a minute or refresh **`YT_PROXY`** via the panel in "
                   "[`tools/`](tools/README.md).")
    return verdict + "\n\n" + "\n".join(lines)
# ----------------------------------------------------------------------------- helpers
def _ranking_rows(scored: list[dict]) -> list[list]:
rows = []
for rank, v in enumerate(scored, start=1):
rows.append([
rank,
v.get("title", v["video_id"]),
f"{v.get('positive_share', 0) * 100:.0f}%",
v.get("n_comments", 0),
v.get("note", "") or "ok",
v["url"],
])
return rows
def _safe_name(text: str) -> str:
return re.sub(r"[^A-Za-z0-9._-]+", "_", text).strip("_")[:60] or "tutorial"
def _collect_keywords(primary_kw, secondary_kw) -> dict:
primary = (primary_kw or "").strip()
secondary, seen = [], {primary.lower()}
for part in (secondary_kw or "").split(","):
kw = part.strip()
if kw and kw.lower() not in seen:
seen.add(kw.lower())
secondary.append(kw)
return {"primary": primary, "secondary": secondary}
def _redact_secrets(text, *secrets):
    """Return *text* with every non-empty secret replaced by ``***`` (exact matches only)."""
    for secret in secrets:
        if secret:
            text = text.replace(secret, "***")
    return text


def run_pipeline(topic, hf_token, yt_api_key, yt_proxy, llm_model, vlm_model,
                 w_llm, w_whisper, lead, max_shots, primary_kw, secondary_kw,
                 content_brief="", progress=gr.Progress()):
    """Generator yielding (status_md, ranking_df, transcript, docx_file).

    Runs search -> sentiment ranking -> transcript -> tutorial text ->
    screenshots + captions -> .docx, yielding a UI update after each stage.
    Screenshot failures are tolerated (the tutorial becomes text-only).

    Raises:
        gr.Error: for a missing topic or Hugging Face token, an empty search
            result, and for TranscriptError / SentimentError / RuntimeError /
            ValueError raised by the pipeline stages.

    The temporary work directory is deleted when the run fails or is cancelled.
    The materialised cookie file is always removed. On success the directory is
    kept because the yielded .docx lives inside it.
    """
    log: list[str] = []

    def status(msg: str):
        log.append(msg)
        return "\n\n".join(log)

    topic = (topic or "").strip()
    if not topic:
        raise gr.Error("Please enter a topic.")
    if not (hf_token or "").strip():
        raise gr.Error("Please paste your Hugging Face token (used for the LLM + vision model).")

    workdir = tempfile.mkdtemp(prefix="ytt_")
    frames_dir = os.path.join(workdir, "frames")
    cookiefile = None
    proxy = None
    media_proxy = None
    finished = False
    try:
        api_key = _resolve_api_key(yt_api_key)
        cookiefile = _cookiefile(workdir)
        proxy = _resolve_proxy(yt_proxy)
        media_proxy = _resolve_media_proxy()
        if proxy:
            # Never log the proxy URL itself — it carries user:pass credentials.
            src = "your proxy" if (yt_proxy or "").strip() else "the `YT_PROXY` secret"
            log.append(f"🔐 Routing transcript + screenshot requests through {src}.")

        # 1. Search ------------------------------------------------------------------
        progress(0.03, desc="Searching")
        yield status(f"🔍 Searching top videos for **{topic}**…"), gr.update(), gr.update(), gr.update()
        videos = search_mod.search_top5(topic)
        if not videos:
            # Without this guard the no-API-key path below fails on videos[0].
            raise gr.Error("No videos were found for this topic. Try a different topic.")
        yield status(f"Found {len(videos)} candidate videos."), gr.update(), gr.update(), gr.update()

        # 2. Sentiment ranking (YouTube Data API comments) ---------------------------
        if api_key:
            yield (status("💬 Fetching comments (YouTube Data API) and scoring sentiment…"),
                   gr.update(), gr.update(), gr.update())
            best, scored = sentiment_mod.rank_by_sentiment(videos, api_key, progress)
            picked_msg = f"({best['positive_share'] * 100:.0f}% positive comments)"
        else:
            best = videos[0]
            scored = [{**v, "positive_share": 0.0, "n_comments": 0,
                       "note": "sentiment skipped (no API key)", "search_rank": i}
                      for i, v in enumerate(videos)]
            picked_msg = "(no YouTube Data API key → used top search result)"
        ranking = gr.update(value=_ranking_rows(scored))
        yield (status(f"🏆 Picked **{best.get('title', best['video_id'])}** {picked_msg}."),
               ranking, gr.update(), gr.update())

        # 3. Transcript (youtube-transcript-api) -------------------------------------
        progress(0.3, desc="Transcript")
        yield status("📝 Fetching the timestamped transcript…"), ranking, gr.update(), gr.update()
        segs = transcribe_mod.get_segments(best["video_id"], proxy=proxy)
        transcript = transcribe_mod.transcript_text(segs)
        yield (status(f"Transcript ready ({len(segs)} segments)."),
               ranking, gr.update(value=transcript), gr.update())

        # 4. Tutorial text -----------------------------------------------------------
        progress(0.6, desc="Writing tutorial")
        keywords = _collect_keywords(primary_kw, secondary_kw)
        kw_note = f" • primary: '{keywords['primary']}'" if keywords["primary"] else ""
        if keywords["secondary"]:
            kw_note += f" • secondary: {', '.join(keywords['secondary'])}"
        if (content_brief or "").strip():
            kw_note += " • honoring your content brief"
        yield (status(f"🤖 Generating tutorial with `{llm_model}`{kw_note}…"),
               ranking, gr.update(value=transcript), gr.update())
        tut = tutorial_mod.generate_tutorial(transcript, hf_token.strip(), llm_model,
                                             keywords, brief=content_brief)
        if keywords["primary"]:
            n = tutorial_mod.count_keyword(tut, keywords["primary"])
            yield (status(f"🔑 Primary keyword '{keywords['primary']}' appears {n}× in the post."),
                   ranking, gr.update(value=transcript), gr.update())

        # 5. Screenshots (weighted timestamps -> yt-dlp clip download -> ffmpeg) ------
        selected, caps = {}, {}
        times = frames_mod.compute_shot_times(
            tut["steps"], segs, w_llm=float(w_llm), w_whisper=float(w_whisper),
            lead=float(lead), max_shots=int(max_shots))
        if times:
            progress(0.75, desc="Screenshots")
            yield (status(f"🎞️ Capturing {len(times)} screenshots at weighted timestamps…"),
                   ranking, gr.update(value=transcript), gr.update())
            try:
                selected = frames_mod.capture_shots(
                    times, best["video_id"], frames_dir, cookiefile, proxy,
                    media_proxy, progress)
            except Exception as exc:
                # Best effort: fall back to a text-only tutorial. Tool errors may
                # echo the proxy URL, so mask it before showing the message.
                detail = _redact_secrets(str(exc), proxy, media_proxy)[:400]
                yield (status(f"⚠️ Couldn't fetch screenshots — text-only tutorial. "
                              f"`{type(exc).__name__}: {detail}`"),
                       ranking, gr.update(value=transcript), gr.update())
        if selected:
            progress(0.88, desc="Captioning")
            yield (status(f"✍️ Captioning {len(selected)} screenshots with `{vlm_model}`…"),
                   ranking, gr.update(value=transcript), gr.update())
            caps = captions_mod.caption_frames(selected, tut["steps"], hf_token.strip(),
                                               vlm_model, progress)

        # 6. DOCX --------------------------------------------------------------------
        progress(0.96, desc="Building document")
        out_path = os.path.join(workdir, f"{_safe_name(tut['title'])}.docx")
        docx_builder.build_docx(tut, selected, caps, out_path, source_url=best["url"])
        progress(1.0, desc="Done")
        shots_msg = f"{len(selected)} screenshots" if selected else "text-only"
        # Mark success before the final yield so the .docx survives even if the
        # consumer closes the generator right after receiving it.
        finished = True
        yield (status(f"✅ Done ({shots_msg}). Download your tutorial below."),
               ranking, gr.update(value=transcript), gr.update(value=out_path))
    except gr.Error:
        raise
    except (transcribe_mod.TranscriptError, sentiment_mod.SentimentError,
            RuntimeError, ValueError) as exc:
        raise gr.Error(_redact_secrets(str(exc), proxy, media_proxy)) from exc
    finally:
        # The cookie file holds the YT_COOKIES secret; never leave it on disk.
        if cookiefile:
            try:
                os.remove(cookiefile)
            except OSError:
                pass
        if not finished:
            # Failed or cancelled run: nothing in the work dir is needed any more.
            shutil.rmtree(workdir, ignore_errors=True)
def build_ui():
    """Assemble the Gradio Blocks interface and wire up its two buttons.

    Returns:
        The ``gr.Blocks`` instance. Queueing and launching are left to the caller.
    """
    # NOTE(review): the original indentation of this file was lost, so the
    # container nesting below (which components sit inside which Row / Column /
    # Accordion) is reconstructed from statement order — confirm against the
    # rendered layout.
    with gr.Blocks(title="YouTube → Tutorial Post") as demo:
        gr.Markdown(
            "# 📝 YouTube → Tutorial Post Generator\n"
            "Enter a topic, your **Hugging Face token** (LLM + vision model, billed to you) "
            "and a **YouTube Data API key** (for comments). The Space picks the best video by "
            "comment sentiment, pulls its transcript, writes an AEO-friendly tutorial, grabs "
            "real screenshots at the right moments, and builds a **.docx**.\n\n"
            "> 🏠 **YouTube blocks this Space's datacenter IP.** Run your own residential proxy "
            "with the **Home Proxy Panel** "
            "([download / source](https://github.com/vivekchakraverty/tutorialmaker-home-proxy-panel)) "
            "and paste its URL into **Your proxy URL** below — transcript + screenshots then route "
            "through your home IP. Each user brings their own proxy."
        )
        # Main inputs: credentials and topic on the left, model pickers on the right.
        with gr.Row():
            with gr.Column(scale=2):
                topic = gr.Textbox(label="Topic", placeholder="e.g. Excel pivot tables for beginners")
                hf_token = gr.Textbox(label="Hugging Face token", type="password",
                                      placeholder="hf_… (Inference Providers permission)")
                yt_api_key = gr.Textbox(label="YouTube Data API key", type="password",
                                        placeholder="for comments (or set the YOUTUBE_API_KEY secret)")
                # Masked like a password because the URL embeds user:pass credentials.
                yt_proxy = gr.Textbox(
                    label="Your proxy URL (recommended)", type="password",
                    placeholder="http://user:pass@bore.pub:12345 — from the Home Proxy Panel",
                    info="Run the Home Proxy Panel (github.com/vivekchakraverty/"
                         "tutorialmaker-home-proxy-panel) on your own machine and paste its "
                         "proxy URL here so YouTube requests exit from your residential IP. "
                         "Leave blank to use the Space's shared proxy, if configured.")
            with gr.Column(scale=1):
                # allow_custom_value lets users type a model id that is not in the list.
                llm_model = gr.Dropdown(LLM_CHOICES, value=LLM_CHOICES[0],
                                        label="Tutorial LLM", allow_custom_value=True)
                vlm_model = gr.Dropdown(VLM_CHOICES, value=VLM_CHOICES[0],
                                        label="Vision model (captions)", allow_custom_value=True)
        content_brief = gr.Textbox(
            label="What the content must cover (optional)",
            lines=4,
            placeholder=("List the points, steps, or questions the tutorial must address — "
                         "one per line or comma-separated.\n"
                         "e.g. How to create a pivot table\nHow to refresh data\n"
                         "Common pivot table errors"),
            info=("Required coverage for the tutorial writer. Leave blank to just follow "
                  "the video. Screenshots stay optional — if none are suitable the post is "
                  "produced without them."),
        )
        with gr.Accordion("SEO / AEO keywords (optional)", open=False):
            gr.Markdown(
                "The **primary keyword** is used naturally ~3× in the body and placed in "
                "the title, URL slug, meta description, the first 100 words, and one or two "
                "H2 headings. Each **secondary keyword** is used once. The post also follows "
                "answer-engine best practices (direct answer up top, FAQ, last-updated date, "
                "source citation)."
            )
            primary_kw = gr.Textbox(label="Primary keyword", placeholder="e.g. excel pivot tables")
            secondary_kw = gr.Textbox(label="Secondary keywords (comma-separated)",
                                      placeholder="e.g. pivot chart, data summary")
        # Screenshot-timing knobs; run_pipeline forwards them to compute_shot_times.
        with gr.Accordion("Advanced settings", open=False):
            with gr.Row():
                w_llm = gr.Slider(0.0, 1.0, value=0.4, step=0.05, label="Weight: LLM timestamp")
                w_whisper = gr.Slider(0.0, 1.0, value=0.6, step=0.05, label="Weight: transcript timing")
                lead = gr.Slider(0.0, 5.0, value=1.0, step=0.5, label="Lead offset (s)")
                max_shots = gr.Slider(1, 15, value=8, step=1, label="Max screenshots")
        with gr.Row():
            health_btn = gr.Button("🩺 Check YouTube access (proxy + reachability)")
        health_md = gr.Markdown()
        run_btn = gr.Button("Generate tutorial", variant="primary")
        # Outputs, in the same order as the tuples yielded by run_pipeline.
        status_md = gr.Markdown(label="Status")
        ranking_df = gr.Dataframe(
            headers=["#", "Title", "Positive", "Comments", "Note", "URL"],
            label="Sentiment ranking", interactive=False, wrap=True,
        )
        transcript_box = gr.Textbox(label="Transcript preview", lines=10, max_lines=20)
        docx_file = gr.File(label="Download tutorial (.docx)")
        # The health check receives only the proxy field and renders Markdown.
        health_btn.click(check_access, inputs=[yt_proxy], outputs=health_md)
        # Input order must match run_pipeline's positional parameters.
        run_btn.click(
            run_pipeline,
            inputs=[topic, hf_token, yt_api_key, yt_proxy, llm_model, vlm_model,
                    w_llm, w_whisper, lead, max_shots, primary_kw, secondary_kw,
                    content_brief],
            outputs=[status_md, ranking_df, transcript_box, docx_file],
        )
    return demo
# Expose a module-level `demo` so HF Spaces' SSR launcher finds it (avoids the
# "Launching demo not found in __main__" fallback warning).
demo = build_ui()
# Enable Gradio's request queue on the app before it is launched.
demo.queue()
# Launch directly only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()