Spaces:

vivekchakraverty
/

TutorialMaker

Sleeping

App Files Files Community

TutorialMaker / app.py

vivekchakraverty

Per-user residential proxy: proxy URL field + downloadable Home Proxy Panel

12e4183 4 days ago

Raw

History Blame Contribute Delete

20.9 kB

	"""Gradio Space: YouTube topic -> captioned .docx tutorial (API-based acquisition).

	Pipeline:
	search top videos -> rank by YouTube Data API comment sentiment -> transcript via
	youtube-transcript-api -> DeepSeek-V3 tutorial -> real screenshots via yt-dlp stream URL
	+ ffmpeg -ss (weighted timestamps) -> VLM captions -> .docx.

	No video download and no cookies/PO-token UI. A per-user proxy URL can be pasted in the
	UI; otherwise a thin fallback remains via the optional Space secrets YT_COOKIES /
	YT_PROXY (used to reach YouTube when the Space's datacenter IP is blocked).
	"""
	from __future__ import annotations

	import base64
	import binascii
	import os
	import re
	import shutil
	import tempfile
	import time


	def _install_proxy_ca() -> None:
	"""Trust a self-signed proxy CA (secret ``YT_PROXY_CA``) alongside the system
	roots, so an HTTPS (TLS-wrapped) ``YT_PROXY`` validates. TLS-wrapping hides the
	target host (``CONNECT www.youtube.com``) from the Space's egress DPI, which
	otherwise resets YouTube-bound connections. Covers requests / youtube-transcript-api
	(``REQUESTS_CA_BUNDLE``) and stdlib ssl / yt-dlp (``SSL_CERT_FILE``)."""
	ca = os.environ.get("YT_PROXY_CA", "").strip()
	if not ca:
	return
	try:
	import certifi
	bundle = os.path.join(tempfile.gettempdir(), "yt_proxy_ca_bundle.pem")
	with open(certifi.where(), encoding="utf-8") as fh:
	roots = fh.read()
	with open(bundle, "w", encoding="utf-8") as fh:
	fh.write(roots.rstrip() + "\n" + ca + "\n")
	os.environ["REQUESTS_CA_BUNDLE"] = bundle # requests / youtube-transcript-api
	os.environ["SSL_CERT_FILE"] = bundle # stdlib ssl + yt-dlp (see frames.py)
	except Exception:
	pass


	# Runs at import time, ahead of the third-party imports below — presumably so the
	# CA-bundle environment variables are set before those libraries read them; confirm.
	_install_proxy_ca()

	import gradio as gr

	from pipeline import (
	captions as captions_mod,
	docx_builder,
	frames as frames_mod,
	search as search_mod,
	sentiment as sentiment_mod,
	transcribe as transcribe_mod,
	tutorial as tutorial_mod,
	)

	# Model ids offered in the "Tutorial LLM" dropdown; the first entry is the default.
	# The dropdown allows custom values, so this list is a set of suggestions.
	LLM_CHOICES = [
	"deepseek-ai/DeepSeek-V3",
	"meta-llama/Llama-3.3-70B-Instruct",
	"openai/gpt-oss-120b",
	]
	# Model ids offered in the "Vision model (captions)" dropdown; the first entry is
	# the default. Custom values are allowed here as well.
	VLM_CHOICES = [
	"Qwen/Qwen2.5-VL-72B-Instruct",
	"Qwen/Qwen2.5-VL-7B-Instruct",
	"meta-llama/Llama-3.2-90B-Vision-Instruct",
	]


	# --------------------------------------------------------------- thin access fallback
	def _looks_like_netscape(text: str) -> bool:
	head = text.lstrip()
	return head.startswith("#") or "\tTRUE\t" in text or "\tFALSE\t" in text


	def _maybe_b64_decode(text: str) -> str \| None:
	compact = "".join(text.split())
	if len(compact) < 16 or re.search(r"[^A-Za-z0-9+/=]", compact):
	return None
	try:
	decoded = base64.b64decode(compact, validate=True).decode("utf-8", "replace")
	except (binascii.Error, ValueError):
	return None
	return decoded if _looks_like_netscape(decoded) else None


	def _cookiefile(workdir: str) -> str \| None:
	"""Materialize the optional YT_COOKIES secret to a Netscape file; return path or None."""
	data = os.environ.get("YT_COOKIES")
	if not data or not data.strip():
	return None
	if not _looks_like_netscape(data):
	decoded = _maybe_b64_decode(data)
	if decoded:
	data = decoded
	if not data.lstrip().startswith(("# Netscape", "# HTTP")):
	data = "# Netscape HTTP Cookie File\n" + data.lstrip("\n")
	if not data.endswith("\n"):
	data += "\n"
	path = os.path.join(workdir, "cookies.txt")
	with open(path, "w", encoding="utf-8", newline="\n") as fh:
	fh.write(data)
	return path


	def _resolve_proxy(ui_proxy: str \| None = None) -> str \| None:
	"""A per-user proxy pasted in the UI wins over the shared YT_PROXY secret, so each
	user can route through their own residential proxy (see tools/home_proxy_panel.py).
	It also backs screenshot downloads (capture_shots falls back to it)."""
	proxy = (ui_proxy or "").strip() or os.environ.get("YT_PROXY", "").strip()
	return proxy or None


	def _resolve_media_proxy() -> str \| None:
	"""Plain HTTP proxy for large media (googlevideo.com) downloads. The TLS YT_PROXY
	can't sustain multi-MB transfers, so screenshots use this instead; falls back to
	YT_PROXY when unset."""
	mp = os.environ.get("YT_MEDIA_PROXY", "").strip()
	return mp or None


	def _resolve_api_key(ui_key: str \| None) -> str \| None:
	key = (ui_key or "").strip() or os.environ.get("YOUTUBE_API_KEY", "").strip()
	return key or None


	def check_access(yt_proxy=None):
	    """Health check: which secrets are set, the egress IP, and YouTube reachability.

	    Args:
	        yt_proxy: Optional per-user proxy URL from the UI. When blank, the shared
	            ``YT_PROXY`` secret is used; with neither, requests go out directly.

	    Returns:
	        Markdown text: a verdict heading followed by one bullet per check.
	    """
	    import requests

	    proxy = _resolve_proxy(yt_proxy)
	    proxies = {"http": proxy, "https": proxy} if proxy else None
	    src = "your proxy field" if (yt_proxy or "").strip() else "YT_PROXY secret"
	    lines = [
	        f"- Proxy: {('set ✅ (' + src + ')') if proxy else 'not set ⚪'}",
	        f"- Media proxy (`YT_MEDIA_PROXY`, screenshots): "
	        f"{'set ✅' if _resolve_media_proxy() else 'not set ⚪ (falls back to YT_PROXY)'}",
	        f"- Cookies (`YT_COOKIES`): {'set ✅' if os.environ.get('YT_COOKIES') else 'not set ⚪'}",
	        f"- Data API key (`YOUTUBE_API_KEY`): {'set ✅' if os.environ.get('YOUTUBE_API_KEY') else 'not set ⚪'}",
	    ]

	    # Remember whether the egress check worked so the final verdict does not claim
	    # "the proxy connects" when it did not (or when there is no proxy at all).
	    egress_ok = False
	    try:
	        ip = requests.get("https://api.ipify.org", proxies=proxies, timeout=20).text.strip()
	        lines.append(f"- Egress IP {'(via proxy)' if proxy else '(Space direct)'}: `{ip}`")
	        egress_ok = True
	    except Exception as exc:
	        lines.append(f"- Egress IP: ❌ {type(exc).__name__}")

	    attempts = 2  # tolerate a single transient reset

	    def probe(url):
	        """GET *url* (any HTTP status counts as reachable); return (ok, detail)."""
	        last = None
	        for attempt in range(attempts):
	            try:
	                r = requests.get(url, proxies=proxies, timeout=20)
	                return True, f"HTTP {r.status_code}"
	            except Exception as exc:
	                last = exc
	                # Pause only between attempts, not after the final failure.
	                if attempt + 1 < attempts:
	                    time.sleep(1.0)
	        return False, f"{type(last).__name__}: {str(last)[:90]}"

	    # Discriminating battery: neutral-small vs Google-family (same big cert as
	    # YouTube, different hostname) vs YouTube itself. The pass/fail pattern says
	    # whether it's a size/MTU blackhole, hostname-based egress filtering, or reset.
	    neutral = "example.com (neutral)"
	    google = "google.com/204 (Google cert, non-YT name)"
	    yt_robots = "youtube.com/robots.txt"
	    probes = [
	        (neutral, "https://example.com"),
	        (google, "https://www.google.com/generate_204"),
	        ("i.ytimg.com (YT image CDN)", "https://i.ytimg.com/generate_204"),
	        ("youtubei.googleapis.com (YT API)", "https://youtubei.googleapis.com/generate_204"),
	        (yt_robots, "https://www.youtube.com/robots.txt"),
	        ("youtube.com/ (large body)", "https://www.youtube.com/"),
	    ]
	    results = {}
	    for label, url in probes:
	        ok, msg = probe(url)
	        results[label] = ok
	        lines.append(f"- {'✅' if ok else '❌'} {label}: {msg}")

	    if results.get(yt_robots):
	        verdict = "### 🟢 YouTube is reachable — transcript + screenshots should work."
	    elif results.get(google):
	        verdict = ("### 🔴 YouTube is filtered by hostname.\n"
	                   "Google works but YouTube is reset → the network between the Space and the "
	                   "tunnel is dropping the plaintext `CONNECT www.youtube.com`. Fix: run the "
	                   "home proxy as an HTTPS proxy (TLS-wrapped) so the target host is hidden.")
	    elif results.get(neutral):
	        # Google is known to have failed here (previous branch), so no re-check.
	        verdict = ("### 🔴 Large TLS handshakes are being dropped (MTU blackhole).\n"
	                   "Small sites work; Google/YouTube (large cert chains) reset. Fix: clamp MSS on "
	                   "the tunnel path or switch tunnel provider (e.g. ngrok).")
	    elif proxy and egress_ok:
	        verdict = ("### 🔴 YouTube is NOT reachable from the Space.\n"
	                   "The proxy connects (egress IP works) but TLS to YouTube is failing. "
	                   "Try again in a minute or refresh `YT_PROXY` via the panel in "
	                   "[`tools/`](tools/README.md).")
	    else:
	        # Either no proxy is configured or the egress check itself failed, so the
	        # "proxy connects" wording above would be wrong.
	        cause = ("The egress IP check through the proxy failed, so the proxy looks "
	                 "down or misconfigured. " if proxy else "No proxy is configured. ")
	        verdict = ("### 🔴 YouTube is NOT reachable from the Space.\n"
	                   + cause
	                   + "Paste a working proxy URL or refresh `YT_PROXY` via the panel in "
	                   "[`tools/`](tools/README.md).")
	    return verdict + "\n\n" + "\n".join(lines)


	# ----------------------------------------------------------------------------- helpers
	def _ranking_rows(scored: list[dict]) -> list[list]:
	rows = []
	for rank, v in enumerate(scored, start=1):
	rows.append([
	rank,
	v.get("title", v["video_id"]),
	f"{v.get('positive_share', 0) * 100:.0f}%",
	v.get("n_comments", 0),
	v.get("note", "") or "ok",
	v["url"],
	])
	return rows


	def _safe_name(text: str) -> str:
	return re.sub(r"[^A-Za-z0-9._-]+", "_", text).strip("_")[:60] or "tutorial"


	def _collect_keywords(primary_kw, secondary_kw) -> dict:
	primary = (primary_kw or "").strip()
	secondary, seen = [], {primary.lower()}
	for part in (secondary_kw or "").split(","):
	kw = part.strip()
	if kw and kw.lower() not in seen:
	seen.add(kw.lower())
	secondary.append(kw)
	return {"primary": primary, "secondary": secondary}


	def run_pipeline(topic, hf_token, yt_api_key, yt_proxy, llm_model, vlm_model,
	                 w_llm, w_whisper, lead, max_shots, primary_kw, secondary_kw,
	                 content_brief="", progress=gr.Progress()):
	    """Run the topic -> .docx pipeline, streaming progress to the UI.

	    Generator yielding ``(status_md, ranking_df, transcript, docx_file)`` tuples;
	    each element is a ``gr.update(...)`` for the matching output component.

	    Steps: search -> sentiment ranking (skipped without a YouTube Data API key, in
	    which case the top search result is used) -> transcript -> tutorial text ->
	    screenshots -> captions -> .docx. A screenshot failure degrades to a text-only
	    tutorial instead of aborting the run.

	    The temporary work directory is kept only when the run completes (the .docx
	    is served from it); on any failure or cancellation it is removed, including
	    the materialized cookies file.

	    Raises:
	        gr.Error: on a missing topic or Hugging Face token, an empty search
	            result, or when a stage raises ``TranscriptError``, ``SentimentError``,
	            ``RuntimeError`` or ``ValueError``.
	    """
	    log: list[str] = []

	    def status(msg: str):
	        """Append *msg* to the running log and return the whole log as Markdown."""
	        log.append(msg)
	        return "\n\n".join(log)

	    topic = (topic or "").strip()
	    if not topic:
	        raise gr.Error("Please enter a topic.")
	    token = (hf_token or "").strip()
	    if not token:
	        raise gr.Error("Please paste your Hugging Face token (used for the LLM + vision model).")

	    workdir = tempfile.mkdtemp(prefix="ytt_")
	    frames_dir = os.path.join(workdir, "frames")
	    finished = False
	    try:
	        api_key = _resolve_api_key(yt_api_key)
	        cookiefile = _cookiefile(workdir)
	        proxy = _resolve_proxy(yt_proxy)
	        media_proxy = _resolve_media_proxy()
	        if proxy:
	            # Never log the proxy URL itself — it carries user:pass credentials.
	            src = "your proxy" if (yt_proxy or "").strip() else "the `YT_PROXY` secret"
	            log.append(f"🔐 Routing transcript + screenshot requests through {src}.")

	        # 1. Search ------------------------------------------------------------------
	        progress(0.03, desc="Searching")
	        yield status(f"🔍 Searching top videos for {topic}…"), gr.update(), gr.update(), gr.update()
	        videos = search_mod.search_top5(topic)
	        if not videos:
	            # Without this guard the no-API-key path below fails on videos[0].
	            raise gr.Error("No videos found for that topic — try a different search phrase.")
	        yield status(f"Found {len(videos)} candidate videos."), gr.update(), gr.update(), gr.update()

	        # 2. Sentiment ranking (YouTube Data API comments) ---------------------------
	        if api_key:
	            yield status("💬 Fetching comments (YouTube Data API) and scoring sentiment…"), gr.update(), gr.update(), gr.update()
	            best, scored = sentiment_mod.rank_by_sentiment(videos, api_key, progress)
	            picked_msg = f"({best['positive_share'] * 100:.0f}% positive comments)"
	        else:
	            best = videos[0]
	            scored = [{**v, "positive_share": 0.0, "n_comments": 0,
	                       "note": "sentiment skipped (no API key)", "search_rank": i}
	                      for i, v in enumerate(videos)]
	            picked_msg = "(no YouTube Data API key → used top search result)"
	        ranking = gr.update(value=_ranking_rows(scored))
	        yield (status(f"🏆 Picked {best.get('title', best['video_id'])} {picked_msg}."),
	               ranking, gr.update(), gr.update())

	        # 3. Transcript (youtube-transcript-api) -------------------------------------
	        progress(0.3, desc="Transcript")
	        yield status("📝 Fetching the timestamped transcript…"), ranking, gr.update(), gr.update()
	        segs = transcribe_mod.get_segments(best["video_id"], proxy=proxy)
	        transcript = transcribe_mod.transcript_text(segs)
	        yield (status(f"Transcript ready ({len(segs)} segments)."),
	               ranking, gr.update(value=transcript), gr.update())

	        # 4. Tutorial text -----------------------------------------------------------
	        progress(0.6, desc="Writing tutorial")
	        keywords = _collect_keywords(primary_kw, secondary_kw)
	        kw_note = f" • primary: '{keywords['primary']}'" if keywords["primary"] else ""
	        if keywords["secondary"]:
	            kw_note += f" • secondary: {', '.join(keywords['secondary'])}"
	        if (content_brief or "").strip():
	            kw_note += " • honoring your content brief"
	        yield status(f"🤖 Generating tutorial with `{llm_model}`{kw_note}…"), ranking, gr.update(value=transcript), gr.update()
	        tut = tutorial_mod.generate_tutorial(transcript, token, llm_model,
	                                             keywords, brief=content_brief)
	        if keywords["primary"]:
	            n = tutorial_mod.count_keyword(tut, keywords["primary"])
	            yield (status(f"🔑 Primary keyword '{keywords['primary']}' appears {n}× in the post."),
	                   ranking, gr.update(value=transcript), gr.update())

	        # 5. Screenshots (weighted timestamps -> yt-dlp clip download -> ffmpeg) ------
	        selected, caps = {}, {}
	        times = frames_mod.compute_shot_times(
	            tut["steps"], segs, w_llm=float(w_llm), w_whisper=float(w_whisper),
	            lead=float(lead), max_shots=int(max_shots))
	        if times:
	            progress(0.75, desc="Screenshots")
	            yield status(f"🎞️ Capturing {len(times)} screenshots at weighted timestamps…"), ranking, gr.update(value=transcript), gr.update()
	            try:
	                selected = frames_mod.capture_shots(
	                    times, best["video_id"], frames_dir, cookiefile, proxy,
	                    media_proxy, progress)
	            except Exception as exc:
	                # Screenshots are optional: report and continue text-only. The
	                # proxy URLs carry credentials, so scrub them from the echoed text.
	                detail = str(exc)
	                for secret in (proxy, media_proxy):
	                    if secret:
	                        detail = detail.replace(secret, "<proxy>")
	                yield (status(f"⚠️ Couldn't fetch screenshots — text-only tutorial. "
	                              f"`{type(exc).__name__}: {detail[:400]}`"),
	                       ranking, gr.update(value=transcript), gr.update())
	        if selected:
	            progress(0.88, desc="Captioning")
	            yield status(f"✍️ Captioning {len(selected)} screenshots with `{vlm_model}`…"), ranking, gr.update(value=transcript), gr.update()
	            caps = captions_mod.caption_frames(selected, tut["steps"], token, vlm_model, progress)

	        # 6. DOCX --------------------------------------------------------------------
	        progress(0.96, desc="Building document")
	        out_path = os.path.join(workdir, f"{_safe_name(tut['title'])}.docx")
	        docx_builder.build_docx(tut, selected, caps, out_path, source_url=best["url"])

	        progress(1.0, desc="Done")
	        shots_msg = f"{len(selected)} screenshots" if selected else "text-only"
	        # From here on the work directory must survive: the UI serves out_path.
	        finished = True
	        yield (status(f"✅ Done ({shots_msg}). Download your tutorial below."),
	               ranking, gr.update(value=transcript), gr.update(value=out_path))

	    except gr.Error:
	        raise
	    except (transcribe_mod.TranscriptError, sentiment_mod.SentimentError,
	            RuntimeError, ValueError) as exc:
	        raise gr.Error(str(exc)) from exc
	    finally:
	        # Failed or cancelled runs would otherwise leave the temp dir (and the
	        # cookies file inside it) behind.
	        if not finished:
	            shutil.rmtree(workdir, ignore_errors=True)


	def build_ui():
	"""Build and return the Gradio ``Blocks`` app (inputs, outputs and event wiring)."""
	# NOTE(review): indentation is not visible in this view, so which widgets sit inside
	# which Row / Column / Accordion cannot be confirmed here — check the real file.
	with gr.Blocks(title="YouTube → Tutorial Post") as demo:
	gr.Markdown(
	"# 📝 YouTube → Tutorial Post Generator\n"
	"Enter a topic, your Hugging Face token (LLM + vision model, billed to you) "
	"and a YouTube Data API key (for comments). The Space picks the best video by "
	"comment sentiment, pulls its transcript, writes an AEO-friendly tutorial, grabs "
	"real screenshots at the right moments, and builds a .docx.\n\n"
	"> 🏠 YouTube blocks this Space's datacenter IP. Run your own residential proxy "
	"with the Home Proxy Panel "
	"([download / source](https://github.com/vivekchakraverty/tutorialmaker-home-proxy-panel)) "
	"and paste its URL into Your proxy URL below — transcript + screenshots then route "
	"through your home IP. Each user brings their own proxy."
	)
	with gr.Row():
	with gr.Column(scale=2):
	topic = gr.Textbox(label="Topic", placeholder="e.g. Excel pivot tables for beginners")
	# Credential-style inputs use type="password" so their values are masked.
	hf_token = gr.Textbox(label="Hugging Face token", type="password",
	placeholder="hf_… (Inference Providers permission)")
	yt_api_key = gr.Textbox(label="YouTube Data API key", type="password",
	placeholder="for comments (or set the YOUTUBE_API_KEY secret)")
	yt_proxy = gr.Textbox(
	label="Your proxy URL (recommended)", type="password",
	placeholder="http://user:pass@bore.pub:12345 — from the Home Proxy Panel",
	info="Run the Home Proxy Panel (github.com/vivekchakraverty/"
	"tutorialmaker-home-proxy-panel) on your own machine and paste its "
	"proxy URL here so YouTube requests exit from your residential IP. "
	"Leave blank to use the Space's shared proxy, if configured.")
	with gr.Column(scale=1):
	# Both dropdowns default to the first list entry and accept custom model ids.
	llm_model = gr.Dropdown(LLM_CHOICES, value=LLM_CHOICES[0],
	label="Tutorial LLM", allow_custom_value=True)
	vlm_model = gr.Dropdown(VLM_CHOICES, value=VLM_CHOICES[0],
	label="Vision model (captions)", allow_custom_value=True)

	content_brief = gr.Textbox(
	label="What the content must cover (optional)",
	lines=4,
	placeholder=("List the points, steps, or questions the tutorial must address — "
	"one per line or comma-separated.\n"
	"e.g. How to create a pivot table\nHow to refresh data\n"
	"Common pivot table errors"),
	info=("Required coverage for the tutorial writer. Leave blank to just follow "
	"the video. Screenshots stay optional — if none are suitable the post is "
	"produced without them."),
	)

	with gr.Accordion("SEO / AEO keywords (optional)", open=False):
	gr.Markdown(
	"The primary keyword is used naturally ~3× in the body and placed in "
	"the title, URL slug, meta description, the first 100 words, and one or two "
	"H2 headings. Each secondary keyword is used once. The post also follows "
	"answer-engine best practices (direct answer up top, FAQ, last-updated date, "
	"source citation)."
	)
	primary_kw = gr.Textbox(label="Primary keyword", placeholder="e.g. excel pivot tables")
	secondary_kw = gr.Textbox(label="Secondary keywords (comma-separated)",
	placeholder="e.g. pivot chart, data summary")

	# Screenshot-timing controls; their values are passed straight to run_pipeline.
	with gr.Accordion("Advanced settings", open=False):
	with gr.Row():
	w_llm = gr.Slider(0.0, 1.0, value=0.4, step=0.05, label="Weight: LLM timestamp")
	w_whisper = gr.Slider(0.0, 1.0, value=0.6, step=0.05, label="Weight: transcript timing")
	lead = gr.Slider(0.0, 5.0, value=1.0, step=0.5, label="Lead offset (s)")
	max_shots = gr.Slider(1, 15, value=8, step=1, label="Max screenshots")

	with gr.Row():
	health_btn = gr.Button("🩺 Check YouTube access (proxy + reachability)")
	health_md = gr.Markdown()

	run_btn = gr.Button("Generate tutorial", variant="primary")

	# Output components, in the order run_pipeline yields them.
	status_md = gr.Markdown(label="Status")
	ranking_df = gr.Dataframe(
	headers=["#", "Title", "Positive", "Comments", "Note", "URL"],
	label="Sentiment ranking", interactive=False, wrap=True,
	)
	transcript_box = gr.Textbox(label="Transcript preview", lines=10, max_lines=20)
	docx_file = gr.File(label="Download tutorial (.docx)")

	# The health check only needs the proxy field; its Markdown result goes to health_md.
	health_btn.click(check_access, inputs=[yt_proxy], outputs=health_md)
	# Input order must match run_pipeline's positional parameters.
	run_btn.click(
	run_pipeline,
	inputs=[topic, hf_token, yt_api_key, yt_proxy, llm_model, vlm_model,
	w_llm, w_whisper, lead, max_shots, primary_kw, secondary_kw,
	content_brief],
	outputs=[status_md, ranking_df, transcript_box, docx_file],
	)
	return demo


	# Expose a module-level `demo` so HF Spaces' SSR launcher finds it (avoids the
	# "Launching demo not found in __main__" fallback warning).
	demo = build_ui()
	# Enable Gradio's request queue on the app before it is launched.
	demo.queue()

	if __name__ == "__main__":
	    # Direct execution (e.g. `python app.py`) starts the server.
	    demo.launch()