# Hugging Face Space (status when captured: Sleeping)
import json
import os
import tempfile
from datetime import datetime
from io import StringIO

import gradio as gr
import pandas as pd
from smolagents import CodeAgent, HfApiModel
from sqlalchemy import text

from database import (
    clear_database,
    create_dynamic_table,
    engine,
    insert_rows_into_table,
)
# Single module-level agent shared by every LLM call below. It is created
# with an empty tool list and the Qwen2.5-Coder-32B-Instruct model.
agent = CodeAgent(
    model=HfApiModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct"),
    tools=[],
)
def _normalize_domain_info(raw):
    """Coerce the agent's answer into a dict with the keys callers index.

    Accepts either an already-parsed dict or a string containing a JSON
    object, and always returns a dict with ``domain`` (str), ``keywords``
    (list of str) and ``report_type`` (str), using fallbacks for anything
    missing or malformed.
    """
    parsed = raw
    if isinstance(raw, str):
        # The model may wrap the JSON object in prose or code fences, so
        # decode only the outermost {...} span.
        start, end = raw.find("{"), raw.rfind("}")
        try:
            parsed = json.loads(raw[start:end + 1]) if 0 <= start < end else {}
        except json.JSONDecodeError:
            parsed = {}
    if not isinstance(parsed, dict):
        parsed = {}

    keywords = parsed.get("keywords") or []
    if isinstance(keywords, str):
        keywords = keywords.split(",")
    elif not isinstance(keywords, (list, tuple)):
        keywords = [keywords]
    cleaned = [str(word).strip() for word in keywords]

    return {
        **parsed,
        "domain": str(parsed.get("domain") or "Unknown"),
        "keywords": [word for word in cleaned if word],
        "report_type": str(parsed.get("report_type") or "general"),
    }


def analyze_content(full_text):
    """Determine document type and key themes.

    Sends the first 10,000 characters of ``full_text`` to the shared agent
    and asks for a JSON description of the text's domain.

    Args:
        full_text: Concatenated text of all uploaded documents.

    Returns:
        A dict that always contains ``domain``, ``keywords`` and
        ``report_type``; fallback values are used when the agent's answer
        cannot be parsed.
    """
    # Slice outside the f-string so no explanatory comment ends up inside
    # the prompt that is sent to the model.
    excerpt = full_text[:10000]
    analysis_prompt = f"""
    Analyze this text and identify its primary domain:
    {excerpt}
    Possible domains:
    - Business/Financial
    - Historical
    - Scientific
    - Technical
    - Legal
    - Literary
    Return JSON format:
    {{
    "domain": "primary domain",
    "keywords": ["list", "of", "key", "terms"],
    "report_type": "business|historical|scientific|technical|legal|literary"
    }}
    """
    # agent.run() has no ``output_type`` parameter (passing one raises
    # TypeError), so parse and validate the JSON here instead.
    return _normalize_domain_info(agent.run(analysis_prompt))
def generate_report(full_text, domain, file_names):
    """Generate a domain-specific markdown report.

    Args:
        full_text: Concatenated text of all documents; only the first
            20,000 characters are included in the prompt.
        domain: Report flavour inserted into the prompt (e.g. "business").
        file_names: Names of the source files, listed in the prompt.

    Returns:
        The agent's answer converted to ``str`` so it can be used directly
        as a Markdown value.
    """
    # Slice and join outside the f-string so that no code comment leaks
    # into the text sent to the model.
    excerpt = full_text[:20000]
    listed_files = ', '.join(file_names)
    report_prompt = f"""
    Create a comprehensive {domain} report from these documents:
    Files: {listed_files}
    Content:
    {excerpt}
    Report structure:
    1. Executive Summary
    2. Key Findings/Analysis
    3. Important Metrics/Statistics (if applicable)
    4. Timeline of Events (historical) or Financial Overview (business)
    5. Conclusions/Recommendations
    Include markdown formatting with headings, bullet points, and tables where appropriate.
    """
    return str(agent.run(report_prompt))
def process_files(file_paths):
    """Process multiple files and generate a report.

    Each file is read as UTF-8 text (undecodable bytes are dropped), added
    to the combined text, and separately sent to the agent for a
    best-effort CSV conversion.

    Args:
        file_paths: Iterable of paths to the uploaded files; ``None`` is
            treated as no files.

    Returns:
        A ``(domain_info, report, combined_df)`` tuple: the result of
        ``analyze_content``, the result of ``generate_report``, and the
        concatenation of every table that could be parsed (an empty
        DataFrame when there were none).
    """
    sections = []
    file_names = []
    structured_data = []
    for file_path in file_paths or []:
        try:
            name = os.path.basename(file_path)
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        except (OSError, TypeError, ValueError) as e:
            # An unreadable file is skipped entirely.
            print(f"Error processing {file_path}: {str(e)}")
            continue
        sections.append(f"\n\n--- {name} ---\n{content}")
        file_names.append(name)

        # Structure detection for tables. This is best-effort: a failure
        # here only drops this file's table; its text stays in the report.
        try:
            structure_prompt = f"Convert to CSV:\n{content}\nReturn ONLY CSV:"
            csv_output = agent.run(structure_prompt)
            df = pd.read_csv(StringIO(csv_output), dtype=str).dropna(how='all')
        except Exception as e:
            print(f"Error processing {file_path}: {str(e)}")
            continue
        structured_data.append(df)

    # Join once instead of growing a string inside the loop.
    full_text = "".join(sections)

    # Domain analysis
    domain_info = analyze_content(full_text)
    # Report generation
    report = generate_report(full_text, domain_info["report_type"], file_names)
    # Combine structured data
    if structured_data:
        combined_df = pd.concat(structured_data, ignore_index=True)
    else:
        combined_df = pd.DataFrame()
    return domain_info, report, combined_df
def handle_upload(files):
    """Gradio callback: analyse the uploaded files and fill the UI.

    Returns a list of seven component updates, in the order the click
    handler lists its outputs. When ``files`` is empty, the first six are
    no-op updates and the last one is hidden.
    """
    if not files:
        unchanged = [gr.update()] * 6
        return unchanged + [gr.update(visible=False)]

    domain_info, report, df = process_files(files)

    domain_line = f"**Document Type:** {domain_info['domain']}"
    keywords_line = f"**Key Themes:** {', '.join(domain_info['keywords'][:5])}"
    # Preview at most the first ten rows; pass None when there is no data.
    preview = None if df.empty else df.head(10)

    return [
        gr.Markdown(value=domain_line),
        gr.Markdown(value=keywords_line),
        gr.Dataframe(value=preview),
        gr.Markdown(value=report),
        gr.update(visible=True),
        gr.update(visible=True),
        gr.update(visible=not df.empty),
    ]
def download_report(report_type):
    """Build the target paths for the downloadable report files.

    The paths share a ``<report_type>_report_<timestamp>`` stem inside the
    system temp directory, with ``.pdf``, ``.docx`` and ``.csv`` suffixes in
    that order. No file is written here; only the paths are returned.
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    stem = f"{report_type}_report_{stamp}"
    out_dir = tempfile.gettempdir()

    # Generate files (implementation depends on your PDF/DOCX libraries)
    # Add your preferred reporting libraries here
    paths = []
    for extension in ("pdf", "docx", "csv"):
        paths.append(os.path.join(out_dir, f"{stem}.{extension}"))
    return paths
# UI definition: upload controls and metadata on the left, result tabs and
# download buttons on the right.
with gr.Blocks() as demo:
    gr.Markdown("# Multi-Document Analysis System")

    with gr.Row():
        with gr.Column(scale=1):
            # NOTE(review): process_files reads uploads as UTF-8 text, so
            # binary .doc/.docx files will likely not yield usable content.
            # Confirm whether a Word parser is intended.
            file_input = gr.File(
                label="Upload Documents",
                file_count="multiple",
                file_types=[".txt", ".doc", ".docx"],
                type="filepath",
            )
            process_btn = gr.Button("Analyze Documents", variant="primary")

            # Hidden until an analysis has run.
            with gr.Group(visible=False) as meta_group:
                domain_display = gr.Markdown()
                keywords_display = gr.Markdown()

        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("Structured Data"):
                    data_table = gr.Dataframe(
                        label="Combined Data Preview",
                        interactive=False,
                    )
                with gr.TabItem("Analysis Report"):
                    report_display = gr.Markdown()

            # Hidden until an analysis has run.
            with gr.Group(visible=False) as download_group:
                gr.Markdown("### Download Options")
                with gr.Row():
                    pdf_btn = gr.DownloadButton("PDF Report")
                    docx_btn = gr.DownloadButton("Word Report")
                    csv_btn = gr.DownloadButton("CSV Data")

    # The order of outputs must match the list returned by handle_upload.
    process_btn.click(
        fn=handle_upload,
        inputs=file_input,
        outputs=[
            domain_display,
            keywords_display,
            data_table,
            report_display,
            meta_group,
            download_group,
            csv_btn,
        ],
    )

    # Connect download buttons (implement actual file generation)
    # NOTE(review): download_report currently returns a list of three paths
    # and writes no files; each button presumably needs a single existing
    # file, so adjust before enabling the wiring below.
    # pdf_btn.click(fn=lambda: download_report("pdf"), outputs=pdf_btn)
    # docx_btn.click(fn=lambda: download_report("docx"), outputs=docx_btn)
    # csv_btn.click(fn=lambda: download_report("csv"), outputs=csv_btn)
# Script entry point: serve the app on all interfaces, port 7860.
if __name__ == "__main__":
    demo.launch(
        server_port=7860,
        server_name="0.0.0.0",
    )