| import gradio as gr
|
| import json
|
| import os
|
| import csv
|
| import tempfile
|
| from huggingface_hub import InferenceClient
|
|
|
|
|
# Model identifier passed to the inference client and shown in the UI header.
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"

# Access token read from the environment; None when HF_TOKEN is not set.
hf_token = os.getenv("HF_TOKEN")

# Single shared client, bound to MODEL_ID, reused by every extraction call.
client = InferenceClient(model=MODEL_ID, token=hf_token)
|
|
|
|
|
|
|
|
|
# Stylesheet handed to gr.Blocks(css=...). The class names below are attached
# to components through their `elem_classes` argument.
custom_css = """
.hero-container {
    background: linear-gradient(135deg, #6366f1 0%, #14b8a6 100%);
    padding: 2.5rem;
    border-radius: 20px;
    color: white;
    margin-bottom: 2rem;
    box-shadow: 0 10px 25px -5px rgba(99, 102, 241, 0.2);
}
.hero-container h1 {
    color: white !important;
    font-size: 2.5rem !important;
    font-weight: 800 !important;
    margin-bottom: 0.5rem;
    text-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.hero-container p {
    color: rgba(255, 255, 255, 0.9) !important;
    font-size: 1.1rem !important;
}
.primary-btn {
    background: linear-gradient(90deg, #6366f1 0%, #14b8a6 100%) !important;
    border: none !important;
    color: white !important;
    font-weight: 600 !important;
    border-radius: 10px !important;
    transition: all 0.3s ease !important;
    padding: 12px 24px !important;
}
.primary-btn:hover {
    transform: translateY(-2px);
    box-shadow: 0 8px 20px -5px rgba(99, 102, 241, 0.4);
}
.secondary-btn {
    border-radius: 10px !important;
    font-weight: 600 !important;
}
.feedback-card {
    border-left: 4px solid #6366f1;
    background-color: rgba(99, 102, 241, 0.05);
}
"""
|
|
|
|
|
|
|
|
|
def generate_kpi_html(structured_data):
    """Render extracted data as a row of HTML KPI cards.

    Args:
        structured_data: Parsed extraction result. A dict produces one card for
            each of its first four entries; a list produces a single card
            showing the number of records.

    Returns:
        An HTML string. A dashed placeholder box is returned when the input is
        empty, is a dict containing an "error" key, or is neither a dict nor a
        list (for example a bare JSON number or string).
    """
    # Function-scope import keeps this block usable without touching the
    # module's import section.
    from html import escape

    placeholder = """
    <div style='display: flex; justify-content: center; align-items: center; height: 100px; border: 2px dashed var(--border-color-primary, #e5e7eb); border-radius: 12px; color: var(--text-color-subdued, #9ca3af);'>
        Await extraction to generate KPI metrics...
    </div>
    """

    # Keyword groups checked in order; the first group with a match decides
    # the card's accent colour.
    accent_rules = (
        ("#10b981", ("price", "total", "amount", "cost", "revenue", "budget")),
        ("#f59e0b", ("date", "deadline", "due", "time")),
        ("#ef4444", ("status", "priority", "importance")),
    )
    default_accent = "#6366f1"

    if not structured_data:
        return placeholder

    if isinstance(structured_data, dict):
        if "error" in structured_data:
            return placeholder

        cards = []
        # Only the first four fields are shown as cards.
        for key, val in list(structured_data.items())[:4]:
            display_key = str(key).replace("_", " ").replace("-", " ").title()

            if isinstance(val, list):
                display_val = ", ".join(map(str, val))
            else:
                display_val = str(val)

            # Truncate before escaping so an HTML entity is never cut in half.
            if len(display_val) > 40:
                display_val = display_val[:37] + "..."

            key_lower = display_key.lower()
            accent_color = next(
                (
                    color
                    for color, keywords in accent_rules
                    if any(word in key_lower for word in keywords)
                ),
                default_accent,
            )

            # Keys and values originate from model output over user-supplied
            # text, so they are escaped before being placed into markup.
            safe_key = escape(display_key)
            safe_val = escape(display_val)

            cards.append(f"""
            <div style='background: var(--body-background-fill, #ffffff); padding: 1rem; border-radius: 12px; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.05); border: 1px solid var(--border-color-primary, #e5e7eb); border-left: 5px solid {accent_color}; min-width: 140px; flex: 1;'>
                <div style='font-size: 0.7rem; color: var(--text-color-subdued, #6b7280); text-transform: uppercase; font-weight: 700; letter-spacing: 0.05em; margin-bottom: 0.25rem;'>{safe_key}</div>
                <div style='font-size: 1.05rem; color: var(--body-text-color, #111827); font-weight: 800; word-break: break-word;'>{safe_val}</div>
            </div>
            """)
        cards_html = "".join(cards)
    elif isinstance(structured_data, list):
        cards_html = f"""
        <div style='background: var(--body-background-fill, #ffffff); padding: 1rem; border-radius: 12px; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.05); border: 1px solid var(--border-color-primary, #e5e7eb); border-left: 5px solid #6366f1; min-width: 140px; flex: 1;'>
            <div style='font-size: 0.7rem; color: var(--text-color-subdued, #6b7280); text-transform: uppercase; font-weight: 700; letter-spacing: 0.05em; margin-bottom: 0.25rem;'>Total Records Found</div>
            <div style='font-size: 1.5rem; color: var(--body-text-color, #111827); font-weight: 800;'>{len(structured_data)}</div>
        </div>
        """
    else:
        # Scalars (numbers, strings, booleans) have no card representation.
        return placeholder

    return f"""
    <div style='display: flex; flex-wrap: wrap; gap: 0.75rem; margin-bottom: 1rem; width: 100%;'>
        {cards_html}
    </div>
    """
|
|
|
def _strip_code_fence(text):
    """Drop a leading ``` / ```json line and a trailing ``` line from *text*."""
    if not text.startswith("```"):
        return text
    lines = text.splitlines()
    if len(lines) < 2:
        # A single-line fenced reply is left untouched and will fail parsing.
        return text
    lines = lines[1:]
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]
    return "\n".join(lines).strip()


def extract_data(raw_text, fields_to_extract):
    """Ask the chat model to pull the requested fields out of free-form text.

    Args:
        raw_text: The unstructured text to analyse.
        fields_to_extract: Description of the fields the model should return.

    Returns:
        A 3-tuple ``(json_value, table_rows, kpi_html)``:
        the parsed JSON (or a dict with an "error" key on failure), a list of
        two-column rows for the table view, and the KPI card HTML produced by
        ``generate_kpi_html``. Failures are reported through the returned
        values rather than raised.
    """
    if not hf_token:
        err_state = {"error": "HF_TOKEN secret is missing. Please add your Hugging Face Access Token to the Space Secrets."}
        return err_state, [["Error", "HF_TOKEN missing"]], generate_kpi_html(err_state)

    # `or ""` tolerates None inputs instead of crashing on .strip().
    if not (raw_text or "").strip() or not (fields_to_extract or "").strip():
        err_state = {"error": "Please provide both raw text and fields to extract."}
        return err_state, [["Error", "Incomplete inputs"]], generate_kpi_html(err_state)

    system_prompt = (
        "You are an expert data extraction assistant. Your job is to extract specific "
        "information from messy, unstructured text and output it as clean, valid JSON.\n"
        "Rules:\n"
        "1. Only extract the fields requested.\n"
        "2. If a field is not found in the text, return 'null' for that field.\n"
        "3. Output ONLY a raw JSON object. Do not include markdown formatting, backticks, or conversational text."
    )

    user_prompt = f"Fields to extract:\n{fields_to_extract}\n\nUnstructured Text:\n{raw_text}"

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    output_text = ""
    try:
        response = client.chat_completion(
            messages=messages,
            max_tokens=1024,
            temperature=0.1,
        )
        # A missing message body is treated as empty output so it is reported
        # as invalid JSON rather than as an attribute error.
        output_text = (response.choices[0].message.content or "").strip()
        structured_data = json.loads(_strip_code_fence(output_text))
    except json.JSONDecodeError:
        error_dict = {
            "error": "The model failed to return valid JSON. It returned this instead:",
            "raw_output": output_text,
        }
        return error_dict, [["Error", "Invalid JSON parsed"]], generate_kpi_html(error_dict)
    except Exception as e:
        # UI boundary: any client/network failure is shown to the user
        # instead of propagating into the event handler.
        error_msg = str(e)
        if "model_not_found" in error_msg or "does not exist" in error_msg:
            err_dict = {
                "error": f"The model '{MODEL_ID}' was not found on Hugging Face.",
                "troubleshooting": [
                    "1. Check your Hugging Face repo for typos (case-sensitive).",
                    "2. Verify HF_TOKEN secret read permissions.",
                    "3. GGUF or LoRA adapter models are not directly supported by the Serverless API.",
                ],
            }
            return err_dict, [["Connection Error", "Model Not Found"]], generate_kpi_html(err_dict)
        err_state = {"error": error_msg}
        return err_state, [["Error", error_msg]], generate_kpi_html(err_state)

    # Valid JSON can still be a bare scalar (number, string, null); the table
    # and KPI views only understand objects and arrays.
    if not isinstance(structured_data, (dict, list)):
        error_dict = {
            "error": "The model returned JSON that is not an object or a list.",
            "raw_output": output_text,
        }
        return error_dict, [["Error", "Unsupported JSON shape"]], generate_kpi_html(error_dict)

    if isinstance(structured_data, dict):
        table_data = [
            [k, ", ".join(map(str, v)) if isinstance(v, list) else str(v)]
            for k, v in structured_data.items()
        ]
    else:
        table_data = [
            [f"Item {idx + 1}", str(item)]
            for idx, item in enumerate(structured_data)
        ]

    return structured_data, table_data, generate_kpi_html(structured_data)
|
|
|
def generate_csv(json_data):
    """Write extracted JSON to a CSV file and return the file's path.

    Args:
        json_data: A dict (written as one row) or a list whose dict items are
            each written as a row. Non-dict list items are skipped.

    Returns:
        Path to ``extracted_data.csv`` inside a fresh temporary directory, or
        ``None`` when the input is empty, is an error dict, is not a dict or
        list, contains no dict records, or the file cannot be written. The
        directory is left in place on success so the returned path stays
        usable by whoever receives it.
    """
    if not json_data:
        return None

    if isinstance(json_data, dict):
        if "error" in json_data:
            return None
        records = [json_data]
    elif isinstance(json_data, list):
        records = [item for item in json_data if isinstance(item, dict)]
    else:
        # Scalars cannot be turned into rows.
        return None

    # dict.fromkeys keeps first-seen order, so columns are stable from run to
    # run (a set would shuffle them).
    headers = list(dict.fromkeys(key for record in records for key in record))
    if not headers:
        return None

    # The temp directory is created only once there is something to write,
    # so the early exits above leave nothing behind on disk.
    temp_dir = tempfile.mkdtemp()
    csv_path = os.path.join(temp_dir, "extracted_data.csv")

    try:
        with open(csv_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=headers)
            writer.writeheader()
            for record in records:
                # Nested containers are stored as their string form in a cell.
                writer.writerow(
                    {
                        k: (str(v) if isinstance(v, (list, dict)) else v)
                        for k, v in record.items()
                    }
                )
    except (OSError, csv.Error, ValueError):
        # Best-effort cleanup of the partially written output.
        try:
            if os.path.exists(csv_path):
                os.remove(csv_path)
            os.rmdir(temp_dir)
        except OSError:
            pass
        return None

    return csv_path
|
|
|
|
|
|
|
|
|
# Page layout and event wiring for the app. `demo` is the Blocks instance
# launched by the script entry point.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:

    # Hero banner. A layout container (Column) wraps the Markdown so the
    # nested component is rendered inside the styled `.hero-container` box.
    with gr.Column(elem_classes="hero-container"):
        gr.Markdown(
            "# 🛟 The Data Rescuer\n"
            "Turn messy logs, disorganized lists, automated transcripts, and raw OCR scripts "
            f"into highly structured business-ready assets — powered by `{MODEL_ID}`."
        )

    with gr.Row():

        # Left column: the two inputs and the action button.
        with gr.Column(scale=1):
            raw_input = gr.Textbox(
                label="1. Paste Unstructured Text",
                placeholder="Paste your messy meeting notes, emails, or raw text here...",
                lines=12,
            )

            schema_input = gr.Textbox(
                label="2. What fields do you want to extract?",
                placeholder="e.g., Company Name, Contact Person, Deadline, Action Items (list)",
                lines=3,
            )

            extract_btn = gr.Button(
                "🚀 Extract Structured Data",
                variant="primary",
                elem_classes="primary-btn",
            )

        # Right column: KPI cards, tabbed results, and CSV export.
        with gr.Column(scale=1):

            # Reuse the renderer's own placeholder so the initial view and the
            # "nothing extracted" view cannot drift apart.
            kpi_output = gr.HTML(value=generate_kpi_html(None))

            with gr.Tabs():
                with gr.TabItem("📊 Structured Table"):
                    table_output = gr.Dataframe(
                        headers=["Field Name", "Extracted Value"],
                        datatype=["str", "str"],
                        interactive=False,
                        wrap=True,
                    )
                with gr.TabItem("🔍 Raw JSON Tree"):
                    json_output = gr.JSON(label="JSON Object")

            with gr.Row():
                export_btn = gr.Button(
                    "💾 Build Export File",
                    variant="secondary",
                    elem_classes="secondary-btn",
                )
                csv_output = gr.File(label="Ready for Download", interactive=False)

    gr.Markdown("### Try it out with these examples:")
    gr.Examples(
        examples=[
            [
                "Hey guys, quick recap of today's sync. Sarah is going to handle the frontend React components by next Tuesday. John, you need to fix the database migration issue before Friday. Also, our client 'Acme Corp' wants the final delivery by October 15th.",
                "Task Owner, Task Description, Deadline, Client Name",
            ],
            [
                "Invoice #99214. From: BlueTech Software. To: Jane Doe. Items: 1x Server Maintenance ($500), 2x Cloud Storage ($100 each). Total due: $700. Please pay by end of month.",
                "Invoice Number, Sender, Recipient, Items (list of names and prices), Total Amount",
            ],
        ],
        inputs=[raw_input, schema_input],
        label="Click an example to populate the inputs",
    )

    # Extraction fills the JSON view, the table, and the KPI cards together.
    extract_btn.click(
        fn=extract_data,
        inputs=[raw_input, schema_input],
        outputs=[json_output, table_output, kpi_output],
    )

    # Export reads whatever is currently shown in the JSON view.
    export_btn.click(
        fn=generate_csv,
        inputs=[json_output],
        outputs=[csv_output],
    )
|
|
|
|
|
# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
|
|