Spaces:

build-small-hackathon
/

Structured-Data-Rescuer

Running

App Files Files Community

Structured-Data-Rescuer / app.py

TensorVizion

Update app.py

10f2c4b verified 11 days ago

Raw

History Blame Contribute Delete

14.6 kB

	import gradio as gr
	import json
	import os
	import csv
	import tempfile
	from huggingface_hub import InferenceClient

	# Hugging Face repo ID of the chat model used for extraction.
	# Swap in your own repo ID to target a different model.
	MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"

	# Access token taken from the HF_TOKEN environment variable (set it as a
	# Space secret); the value is None when the variable is not defined.
	hf_token = os.getenv("HF_TOKEN")

	# Shared inference client bound to the model and token above.
	client = InferenceClient(token=hf_token, model=MODEL_ID)

	# -------------------------
	# Custom CSS Styling
	# -------------------------
	# Stylesheet handed to gr.Blocks(css=...) when the UI is built. The class names
	# are attached to components through `elem_classes`:
	#   - "hero-container": the header block
	#   - "primary-btn":    the extract button
	#   - "secondary-btn":  the export button
	# "feedback-card" is not referenced by any component in the visible part of
	# this file.
	custom_css = """
	.hero-container {
	background: linear-gradient(135deg, #6366f1 0%, #14b8a6 100%);
	padding: 2.5rem;
	border-radius: 20px;
	color: white;
	margin-bottom: 2rem;
	box-shadow: 0 10px 25px -5px rgba(99, 102, 241, 0.2);
	}
	.hero-container h1 {
	color: white !important;
	font-size: 2.5rem !important;
	font-weight: 800 !important;
	margin-bottom: 0.5rem;
	text-shadow: 0 2px 4px rgba(0,0,0,0.1);
	}
	.hero-container p {
	color: rgba(255, 255, 255, 0.9) !important;
	font-size: 1.1rem !important;
	}
	.primary-btn {
	background: linear-gradient(90deg, #6366f1 0%, #14b8a6 100%) !important;
	border: none !important;
	color: white !important;
	font-weight: 600 !important;
	border-radius: 10px !important;
	transition: all 0.3s ease !important;
	padding: 12px 24px !important;
	}
	.primary-btn:hover {
	transform: translateY(-2px);
	box-shadow: 0 8px 20px -5px rgba(99, 102, 241, 0.4);
	}
	.secondary-btn {
	border-radius: 10px !important;
	font-weight: 600 !important;
	}
	.feedback-card {
	border-left: 4px solid #6366f1;
	background-color: rgba(99, 102, 241, 0.05);
	}
	"""

	# -------------------------
	# Helper & Extraction Logic
	# -------------------------
	def generate_kpi_html(structured_data):
	    """Build the KPI summary shown above the result tabs as an HTML fragment.

	    Args:
	        structured_data: Parsed JSON from the extraction step. A dict yields
	            one card for each of its first four items; a list yields a single
	            "Total Records Found" card.

	    Returns:
	        An HTML string. A dashed placeholder box is returned when the input is
	        empty, is a dict carrying an "error" key, or is neither a dict nor a
	        list (for example a bare number or string).
	    """
	    # Local import keeps this function self-contained.
	    import html

	    placeholder = """
	<div style='display: flex; justify-content: center; align-items: center; height: 100px; border: 2px dashed var(--border-color-primary, #e5e7eb); border-radius: 12px; color: var(--text-color-subdued, #9ca3af);'>
	Await extraction to generate KPI metrics...
	</div>
	"""

	    if not structured_data or not isinstance(structured_data, (dict, list)):
	        # Scalars have nothing to show as cards; checking `"error" in 42`
	        # would also raise TypeError.
	        return placeholder
	    if isinstance(structured_data, dict) and "error" in structured_data:
	        return placeholder

	    # (keywords, accent colour) pairs, checked in order; first match wins.
	    accent_rules = (
	        (("price", "total", "amount", "cost", "revenue", "budget"), "#10b981"),  # Emerald for cash/costs
	        (("date", "deadline", "due", "time"), "#f59e0b"),  # Amber for dates/reminders
	        (("status", "priority", "importance"), "#ef4444"),  # Crimson for status/alerts
	    )
	    default_accent = "#6366f1"  # Indigo

	    if isinstance(structured_data, dict):
	        cards = []
	        # Only the first four attributes are shown as metrics.
	        for key, val in list(structured_data.items())[:4]:
	            display_key = str(key).replace("_", " ").replace("-", " ").title()

	            if isinstance(val, list):
	                display_val = ", ".join(map(str, val))
	            else:
	                display_val = str(val)

	            # Truncate before escaping so an HTML entity is never cut in half.
	            if len(display_val) > 40:
	                display_val = display_val[:37] + "..."

	            label_lower = display_key.lower()
	            accent_color = next(
	                (
	                    color
	                    for keywords, color in accent_rules
	                    if any(word in label_lower for word in keywords)
	                ),
	                default_accent,
	            )

	            # Keys and values originate from model output over user-supplied
	            # text, so they must be escaped before being placed in markup.
	            safe_key = html.escape(display_key)
	            safe_val = html.escape(display_val)

	            cards.append(f"""
	<div style='background: var(--body-background-fill, #ffffff); padding: 1rem; border-radius: 12px; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.05); border: 1px solid var(--border-color-primary, #e5e7eb); border-left: 5px solid {accent_color}; min-width: 140px; flex: 1;'>
	<div style='font-size: 0.7rem; color: var(--text-color-subdued, #6b7280); text-transform: uppercase; font-weight: 700; letter-spacing: 0.05em; margin-bottom: 0.25rem;'>{safe_key}</div>
	<div style='font-size: 1.05rem; color: var(--body-text-color, #111827); font-weight: 800; word-break: break-word;'>{safe_val}</div>
	</div>
	""")
	        cards_html = "".join(cards)
	    else:
	        # Summary KPI for array data structures.
	        cards_html = f"""
	<div style='background: var(--body-background-fill, #ffffff); padding: 1rem; border-radius: 12px; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.05); border: 1px solid var(--border-color-primary, #e5e7eb); border-left: 5px solid #6366f1; min-width: 140px; flex: 1;'>
	<div style='font-size: 0.7rem; color: var(--text-color-subdued, #6b7280); text-transform: uppercase; font-weight: 700; letter-spacing: 0.05em; margin-bottom: 0.25rem;'>Total Records Found</div>
	<div style='font-size: 1.5rem; color: var(--body-text-color, #111827); font-weight: 800;'>{len(structured_data)}</div>
	</div>
	"""

	    return f"""
	<div style='display: flex; flex-wrap: wrap; gap: 0.75rem; margin-bottom: 1rem; width: 100%;'>
	{cards_html}
	</div>
	"""

	def _strip_code_fence(text):
	    """Return ``text`` without a wrapping Markdown code fence, if it has one."""
	    if not text.startswith("```"):
	        return text
	    lines = text.splitlines()
	    if len(lines) < 2:
	        # A single-line reply has no separate fence lines to drop.
	        return text
	    lines = lines[1:]  # opening fence, possibly tagged (e.g. ```json)
	    if lines and lines[-1].strip() == "```":
	        lines = lines[:-1]
	    return "\n".join(lines).strip()


	def _json_to_rows(structured_data):
	    """Flatten a parsed JSON object or array into [name, value] string rows."""
	    if isinstance(structured_data, dict):
	        return [
	            [key, ", ".join(map(str, value)) if isinstance(value, list) else str(value)]
	            for key, value in structured_data.items()
	        ]
	    return [[f"Item {idx + 1}", str(item)] for idx, item in enumerate(structured_data)]


	def extract_data(raw_text, fields_to_extract):
	    """Ask the chat model to pull the requested fields out of ``raw_text``.

	    Args:
	        raw_text: Unstructured text to extract from.
	        fields_to_extract: Free-form description of the wanted fields.

	    Returns:
	        A 3-tuple ``(json_value, table_rows, kpi_html)``:
	        the parsed JSON (or a dict with an "error" key on failure), a list of
	        two-column rows for the table view, and the KPI HTML fragment.
	        Failures are reported through the return value; no exception from the
	        model call or JSON parsing is propagated.
	    """
	    if not hf_token:
	        err_state = {"error": "HF_TOKEN secret is missing. Please add your Hugging Face Access Token to the Space Secrets."}
	        return err_state, [["Error", "HF_TOKEN missing"]], generate_kpi_html(err_state)

	    # `or ""` tolerates None as well as empty strings.
	    if not (raw_text or "").strip() or not (fields_to_extract or "").strip():
	        err_state = {"error": "Please provide both raw text and fields to extract."}
	        return err_state, [["Error", "Incomplete inputs"]], generate_kpi_html(err_state)

	    # Construct the system instruction
	    system_prompt = (
	        "You are an expert data extraction assistant. Your job is to extract specific "
	        "information from messy, unstructured text and output it as clean, valid JSON.\n"
	        "Rules:\n"
	        "1. Only extract the fields requested.\n"
	        "2. If a field is not found in the text, return 'null' for that field.\n"
	        "3. Output ONLY a raw JSON object. Do not include markdown formatting, backticks, or conversational text."
	    )

	    user_prompt = f"Fields to extract:\n{fields_to_extract}\n\nUnstructured Text:\n{raw_text}"

	    messages = [
	        {"role": "system", "content": system_prompt},
	        {"role": "user", "content": user_prompt},
	    ]

	    # Bound before the try so the JSONDecodeError handler can always read it,
	    # even if that exception type is raised before the reply text is assigned.
	    output_text = ""
	    try:
	        # Call the model via the chat completion API
	        response = client.chat_completion(
	            messages=messages,
	            max_tokens=1024,
	            temperature=0.1,
	        )

	        # A missing message body is treated as an empty reply, which then
	        # surfaces as the "invalid JSON" error below.
	        output_text = (response.choices[0].message.content or "").strip()

	        # Fallback: the model may still wrap its answer in a code fence.
	        structured_data = json.loads(_strip_code_fence(output_text))

	        if not isinstance(structured_data, (dict, list)):
	            # Valid JSON, but a bare scalar cannot populate the table or KPIs.
	            err_state = {
	                "error": "The model returned JSON that is not an object or array.",
	                "raw_output": output_text,
	            }
	            return err_state, [["Error", "Unexpected JSON type"]], generate_kpi_html(err_state)

	        return structured_data, _json_to_rows(structured_data), generate_kpi_html(structured_data)

	    except json.JSONDecodeError:
	        error_dict = {
	            "error": "The model failed to return valid JSON. It returned this instead:",
	            "raw_output": output_text,
	        }
	        return error_dict, [["Error", "Invalid JSON parsed"]], generate_kpi_html(error_dict)
	    except Exception as e:
	        # UI boundary: any other failure is shown to the user, not raised.
	        error_msg = str(e)
	        if "model_not_found" in error_msg or "does not exist" in error_msg:
	            err_dict = {
	                "error": f"The model '{MODEL_ID}' was not found on Hugging Face.",
	                "troubleshooting": [
	                    "1. Check your Hugging Face repo for typos (case-sensitive).",
	                    "2. Verify HF_TOKEN secret read permissions.",
	                    "3. GGUF or LoRA adapter models are not directly supported by the Serverless API.",
	                ],
	            }
	            return err_dict, [["Connection Error", "Model Not Found"]], generate_kpi_html(err_dict)
	        err_state = {"error": error_msg}
	        return err_state, [["Error", error_msg]], generate_kpi_html(err_state)

	def generate_csv(json_data):
	    """Write the extracted JSON to a CSV file and return the file's path.

	    Args:
	        json_data: A dict (written as one row) or a list whose dict items
	            each become a row; non-dict list items are skipped.

	    Returns:
	        Path to ``extracted_data.csv`` inside a fresh temporary directory, or
	        ``None`` when there is nothing to export (empty input, an error dict,
	        a non-dict/list value, no dict rows) or the file cannot be written.
	    """
	    # Local import keeps this function self-contained.
	    import shutil

	    if isinstance(json_data, dict):
	        if not json_data or "error" in json_data:
	            return None
	        data_list = [json_data]
	    elif isinstance(json_data, list):
	        data_list = json_data
	    else:
	        return None

	    rows = [item for item in data_list if isinstance(item, dict)]

	    # dict.fromkeys de-duplicates while keeping first-seen order, so the
	    # column order is stable from run to run (a set would not be).
	    headers = list(dict.fromkeys(key for row in rows for key in row))
	    if not headers:
	        return None

	    # Create the temporary directory only once there is something to write.
	    temp_dir = tempfile.mkdtemp()
	    csv_path = os.path.join(temp_dir, "extracted_data.csv")

	    try:
	        with open(csv_path, "w", newline="", encoding="utf-8") as f:
	            writer = csv.DictWriter(f, fieldnames=headers)
	            writer.writeheader()
	            for row in rows:
	                # Nested containers are stored as their string form.
	                writer.writerow(
	                    {k: (str(v) if isinstance(v, (list, dict)) else v) for k, v in row.items()}
	                )
	    except (OSError, ValueError, csv.Error):
	        # Best effort: report "no file" and do not leave the directory behind.
	        shutil.rmtree(temp_dir, ignore_errors=True)
	        return None

	    return csv_path

	# -------------------------
	# Build the Gradio UI
	# -------------------------
	# NOTE(review): the indentation of this section is missing in this copy of the
	# file, so the nesting of the `with` blocks below cannot be read off the
	# columns. The inline comments describe the intended layout; confirm the
	# exact nesting against the original file before editing.
	with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:

	# Styled Header Block
	# NOTE(review): gr.HTML is entered as a context manager here; confirm that the
	# installed Gradio version supports using it as a container.
	with gr.HTML(elem_classes="hero-container"):
	gr.Markdown(
	f"""
	# 🛟 The Data Rescuer
	Turn messy logs, disorganized lists, automated transcripts, and raw OCR scripts into highly structured business-ready assets — powered by `{MODEL_ID}`.
	"""
	)

	with gr.Row():
	# Left Column: Inputs
	with gr.Column(scale=1):
	raw_input = gr.Textbox(
	label="1. Paste Unstructured Text",
	placeholder="Paste your messy meeting notes, emails, or raw text here...",
	lines=12
	)

	schema_input = gr.Textbox(
	label="2. What fields do you want to extract?",
	placeholder="e.g., Company Name, Contact Person, Deadline, Action Items (list)",
	lines=3
	)

	extract_btn = gr.Button("🚀 Extract Structured Data", variant="primary", elem_classes="primary-btn")

	# Right Column: Multi-view Output Panels
	with gr.Column(scale=1):
	# Dynamic HTML summary cards (Dashboard metrics style)
	# The initial value is the same placeholder markup that generate_kpi_html
	# returns for empty or error input; it is replaced on each extraction.
	kpi_output = gr.HTML(
	value="""
	<div style='display: flex; justify-content: center; align-items: center; height: 100px; border: 2px dashed var(--border-color-primary, #e5e7eb); border-radius: 12px; color: var(--text-color-subdued, #9ca3af);'>
	Await extraction to generate KPI metrics...
	</div>
	"""
	)

	# Two views of the same extraction result: a two-column table and the raw JSON.
	with gr.Tabs():
	with gr.TabItem("📊 Structured Table"):
	table_output = gr.Dataframe(
	headers=["Field Name", "Extracted Value"],
	datatype=["str", "str"],
	interactive=False,
	wrap=True
	)
	with gr.TabItem("🔍 Raw JSON Tree"):
	json_output = gr.JSON(label="JSON Object")

	# Action controls below outputs
	with gr.Row():
	export_btn = gr.Button("💾 Build Export File", variant="secondary", elem_classes="secondary-btn")
	csv_output = gr.File(label="Ready for Download", interactive=False)

	# -------------------------
	# Examples Panel
	# -------------------------
	# Each example is a [raw text, fields] pair matching inputs=[raw_input, schema_input].
	gr.Markdown("### Try it out with these examples:")
	gr.Examples(
	examples=[
	[
	"Hey guys, quick recap of today's sync. Sarah is going to handle the frontend React components by next Tuesday. John, you need to fix the database migration issue before Friday. Also, our client 'Acme Corp' wants the final delivery by October 15th.",
	"Task Owner, Task Description, Deadline, Client Name"
	],
	[
	"Invoice #99214. From: BlueTech Software. To: Jane Doe. Items: 1x Server Maintenance ($500), 2x Cloud Storage ($100 each). Total due: $700. Please pay by end of month.",
	"Invoice Number, Sender, Recipient, Items (list of names and prices), Total Amount"
	]
	],
	inputs=[raw_input, schema_input],
	label="Click an example to populate the inputs"
	)

	# -------------------------
	# Event Connections
	# -------------------------
	# 1. Connect extraction button to the Table View, JSON Tree, and KPI output
	# The outputs list follows the order of the tuple returned by extract_data:
	# (JSON value, table rows, KPI HTML).
	extract_btn.click(
	fn=extract_data,
	inputs=[raw_input, schema_input],
	outputs=[json_output, table_output, kpi_output]
	)

	# 2. Connect CSV generation
	# generate_csv receives the current value of the JSON view, so it has data to
	# export only after an extraction has filled json_output.
	export_btn.click(
	fn=generate_csv,
	inputs=[json_output],
	outputs=[csv_output]
	)

	# Launch the app
	# Only start the server when this file is executed as a script.
	if __name__ == "__main__":
	demo.launch()