Quazim0t0 committed on
Commit
a808dce
·
verified ·
1 Parent(s): 31ed30a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +148 -185
app.py CHANGED
@@ -5,6 +5,7 @@ from smolagents import CodeAgent, HfApiModel
5
  import pandas as pd
6
  from io import StringIO
7
  import tempfile
 
8
  from database import (
9
  engine,
10
  create_dynamic_table,
@@ -12,211 +13,173 @@ from database import (
12
  insert_rows_into_table
13
  )
14
 
15
# Initialize the AI agent
# Single shared LLM-backed agent used by every analysis/restructuring call below.
agent = CodeAgent(
    tools=[],  # no extra tools registered; the model is used for text-to-text tasks only
    model=HfApiModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct"),
)
20
 
21
def get_data_table():
    """Fetch and return the current table data as a DataFrame.

    Reads the first non-internal table found in the SQLite database.

    Returns:
        pd.DataFrame: the table's rows (empty frame if no table or no rows).
        On failure, a one-column frame ``{"Error": [message]}`` so callers
        that render the result as a table keep working.
    """
    try:
        # One connection serves both the catalog lookup and the data read
        # (the original opened two connections back to back).
        with engine.connect() as con:
            tables = con.execute(text(
                "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'"
            )).fetchall()

            if not tables:
                return pd.DataFrame()

            table_name = tables[0][0]

            # Identifiers cannot be bound parameters; quote the name so odd
            # table names can't break the statement, even though the value
            # comes from sqlite_master itself.
            result = con.execute(text(f'SELECT * FROM "{table_name}"'))
            rows = result.fetchall()
            columns = result.keys()

        return pd.DataFrame(rows, columns=columns) if rows else pd.DataFrame()

    except Exception as e:
        # Deliberate best-effort: surface the error as data rather than
        # raising, since the UI displays whatever frame comes back.
        return pd.DataFrame({"Error": [str(e)]})
43
 
44
def process_txt_file(file_path):
    """Analyze text file and convert to structured table.

    Returns a 3-tuple ``(success: bool, message: str, preview: pd.DataFrame)``;
    on success the preview holds the first 10 rows of the extracted table.
    """
    try:
        # Read leniently: unknown/invalid encodings should not abort processing.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()

        # Ask the LLM to restructure the raw text into CSV.
        structure_prompt = f"""
        Convert this text into valid CSV format:
        {content}

        Requirements:
        1. First row must be headers
        2. Consistent columns per row
        3. Quote fields containing commas
        4. Maintain original data relationships

        Return ONLY the CSV content.
        """
        csv_output = agent.run(structure_prompt)

        try:
            df = pd.read_csv(
                StringIO(csv_output),
                on_bad_lines='warn',      # tolerate ragged rows from the LLM
                dtype=str,                # keep everything textual; no type guessing
                encoding_errors='ignore'
            ).dropna(how='all')
        except pd.errors.ParserError as pe:
            return False, f"CSV Parsing Error: {str(pe)}", pd.DataFrame()

        if df.empty or len(df.columns) == 0:
            return False, "No structured data found", pd.DataFrame()

        # Replace any previously loaded dataset with the fresh extraction.
        clear_database()
        table = create_dynamic_table(df)
        insert_rows_into_table(df.to_dict('records'), table)

        return True, "Text analyzed successfully!", df.head(10)

    except Exception as e:
        return False, f"Processing error: {str(e)}", pd.DataFrame()
 
 
 
 
 
 
 
 
85
 
86
def handle_upload(file_obj):
    """Process an uploaded text file and produce all UI output values.

    Returns a 6-element list matching the upload handler's outputs:
    status message, data preview, schema markdown, and visibility
    updates for the upload group, query group, and download button.
    """
    def _vis(flag):
        # Fresh visibility update per output slot.
        return gr.update(visible=flag)

    # Guard: nothing uploaded yet.
    if file_obj is None:
        return ["Please upload a text file.", None, "No schema",
                _vis(True), _vis(False), _vis(False)]

    success, message, df = process_txt_file(file_obj)

    # Guard: processing failed — keep the upload view active.
    if not success:
        return [message, None, "No schema",
                _vis(True), _vis(False), _vis(False)]

    # Success: swap to the query view and show the detected schema.
    schema = "\n".join(f"- {col} (text)" for col in df.columns)
    return [
        message,
        df,
        f"### Detected Schema:\n```\n{schema}\n```",
        _vis(False),
        _vis(True),
        _vis(True),
    ]
 
 
117
 
118
def query_analysis(user_query: str) -> str:
    """Handle natural language queries about the data.

    Returns a Markdown-formatted answer string, or a status/error message.
    """
    try:
        df = get_data_table()
        if df.empty:
            return "Please upload and process a file first."

        # Only df.head() is sent to keep the prompt small — the model
        # answers from this sample, not the full dataset.
        analysis_prompt = f"""
        Analyze this data:
        {df.head().to_csv()}

        Question: {user_query}

        Provide:
        1. Direct answer
        2. Numerical formatting
        3. Data references

        Use Markdown formatting.
        """

        return agent.run(analysis_prompt)

    except Exception as e:
        return f"Query error: {str(e)}"
143
-
144
def download_csv():
    """Write the current table to a temp CSV file and return its path.

    Returns None when there is no data to export.
    """
    df = get_data_table()

    # Guard clause: nothing to export.
    if df.empty:
        return None

    file_path = os.path.join(tempfile.gettempdir(), "processed_data.csv")
    df.to_csv(file_path, index=False)
    return file_path
153
 
154
# Gradio interface setup
with gr.Blocks() as demo:
    # Upload view: shown first, hidden once a file has been processed.
    with gr.Group() as upload_group:
        gr.Markdown("""
        # Text Data Analyzer
        Upload unstructured text files to analyze and query their data
        """)
        file_input = gr.File(
            label="Upload Text File",
            file_types=[".txt"],
            type="filepath"
        )
        status = gr.Textbox(label="Processing Status", interactive=False)

    # Query view: hidden until processing succeeds.
    with gr.Group(visible=False) as query_group:
        with gr.Row():
            with gr.Column(scale=1):
                with gr.Row():
                    user_input = gr.Textbox(label="Ask about the data", scale=4)
                    submit_btn = gr.Button("Submit", scale=1)
                query_output = gr.Markdown(label="Analysis Results")
            with gr.Column(scale=2):
                gr.Markdown("### Extracted Data Preview")
                data_table = gr.Dataframe(
                    label="Structured Data",
                    interactive=False
                )
                download_btn = gr.DownloadButton(
                    "Download as CSV",
                    visible=False
                )
                schema_display = gr.Markdown()
                refresh_btn = gr.Button("Refresh View")

    # Event handlers
    # Upload drives the full pipeline and toggles view visibility.
    file_input.upload(
        fn=handle_upload,
        inputs=file_input,
        outputs=[status, data_table, schema_display, upload_group, query_group, download_btn]
    )

    submit_btn.click(
        fn=query_analysis,
        inputs=user_input,
        outputs=query_output
    )

    # Pressing Enter in the textbox behaves like clicking Submit.
    user_input.submit(
        fn=query_analysis,
        inputs=user_input,
        outputs=query_output
    )

    refresh_btn.click(
        fn=lambda: (get_data_table().head(10), "Schema refreshed"),
        outputs=[data_table, schema_display]
    )

    # DownloadButton receives the generated file path as its value.
    download_btn.click(
        fn=download_csv,
        outputs=download_btn
    )

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )
 
5
  import pandas as pd
6
  from io import StringIO
7
  import tempfile
8
+ from datetime import datetime
9
  from database import (
10
  engine,
11
  create_dynamic_table,
 
13
  insert_rows_into_table
14
  )
15
 
 
16
# Single shared LLM-backed agent used by every analysis/report call below.
agent = CodeAgent(
    tools=[],  # no extra tools registered; the model is used for text-to-text tasks only
    model=HfApiModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct"),
)
20
 
21
def analyze_content(full_text):
    """Determine document type and key themes.

    Args:
        full_text: concatenated text of all uploaded documents.

    Returns:
        The agent's response describing the document domain
        (expected keys: "domain", "keywords", "report_type").
    """
    # Only the first 10k characters are analyzed — enough for domain
    # classification while keeping the prompt within context limits.
    # (Previously this note was written *inside* the f-string and was
    # sent verbatim to the model as part of the prompt.)
    sample = full_text[:10000]
    analysis_prompt = f"""
    Analyze this text and identify its primary domain:
    {sample}

    Possible domains:
    - Business/Financial
    - Historical
    - Scientific
    - Technical
    - Legal
    - Literary

    Return JSON format:
    {{
        "domain": "primary domain",
        "keywords": ["list", "of", "key", "terms"],
        "report_type": "business|historical|scientific|technical|legal|literary"
    }}
    """
    # NOTE(review): `output_type` is not a documented keyword of
    # CodeAgent.run in all smolagents releases — confirm against the
    # installed version; callers index the result like a dict.
    return agent.run(analysis_prompt, output_type="json")
43
 
44
def generate_report(full_text, domain, file_names):
    """Generate a domain-specific report from the combined documents.

    Args:
        full_text: concatenated text of all uploaded documents.
        domain: report flavor, e.g. "business" or "historical".
        file_names: names of the source files, listed in the prompt.

    Returns:
        The agent's Markdown report as returned by ``agent.run``.
    """
    # Cap the content at 20k characters to stay within context limits.
    # (Previously the truncation note was embedded inside the f-string
    # and was sent to the model as part of the prompt.)
    excerpt = full_text[:20000]
    report_prompt = f"""
    Create a comprehensive {domain} report from these documents:
    Files: {', '.join(file_names)}

    Content:
    {excerpt}

    Report structure:
    1. Executive Summary
    2. Key Findings/Analysis
    3. Important Metrics/Statistics (if applicable)
    4. Timeline of Events (historical) or Financial Overview (business)
    5. Conclusions/Recommendations

    Include markdown formatting with headings, bullet points, and tables where appropriate.
    """
    return agent.run(report_prompt)
 
 
 
 
 
 
 
 
 
 
63
 
64
def process_files(file_paths):
    """Process multiple files and generate report.

    Reads every file, concatenates their text, extracts tabular data from
    each via the agent, then classifies the domain and writes a report.

    Returns:
        (domain_info, report, combined_df) — domain metadata from
        analyze_content, the Markdown report, and the concatenated
        per-file DataFrames (empty frame if none parsed).
    """
    full_text = ""
    file_names = []
    structured_data = []

    for file_path in file_paths:
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
                # Separate each document with a named marker so the report
                # prompt can attribute content to its source file.
                full_text += f"\n\n--- {os.path.basename(file_path)} ---\n{content}"
                file_names.append(os.path.basename(file_path))

                # Structure detection for tables
                structure_prompt = f"Convert to CSV:\n{content}\nReturn ONLY CSV:"
                csv_output = agent.run(structure_prompt)
                df = pd.read_csv(StringIO(csv_output), dtype=str).dropna(how='all')
                structured_data.append(df)

        except Exception as e:
            # Best-effort: one unreadable/unparsable file should not abort
            # the whole batch; it is skipped with a console note.
            print(f"Error processing {file_path}: {str(e)}")

    # Domain analysis
    domain_info = analyze_content(full_text)

    # Report generation
    report = generate_report(full_text, domain_info["report_type"], file_names)

    # Combine structured data
    combined_df = pd.concat(structured_data, ignore_index=True) if structured_data else pd.DataFrame()

    return domain_info, report, combined_df
96
 
97
def handle_upload(files):
    """Run the multi-file pipeline and produce all seven UI output values.

    Output order matches the click wiring: domain markdown, keywords
    markdown, data preview, report markdown, meta-group visibility,
    download-group visibility, CSV-button visibility.
    """
    # Guard: nothing selected — leave the first six components untouched
    # and keep the CSV button hidden.
    if not files:
        untouched = [gr.update() for _ in range(6)]
        return untouched + [gr.update(visible=False)]

    domain_info, report, df = process_files(files)

    preview = df.head(10) if not df.empty else None
    top_keywords = ', '.join(domain_info['keywords'][:5])

    return [
        gr.Markdown(value=f"**Document Type:** {domain_info['domain']}"),
        gr.Markdown(value=f"**Key Themes:** {top_keywords}"),
        gr.Dataframe(value=preview),
        gr.Markdown(value=report),
        gr.update(visible=True),
        gr.update(visible=True),
        gr.update(visible=not df.empty),
    ]
115
 
116
def download_report(report_type):
    """Generate downloadable report file paths in the temp directory.

    Args:
        report_type: label embedded in the file names, e.g. "business".

    Returns:
        list[str]: absolute paths for the PDF, DOCX, and CSV variants
        (in that order). File generation itself is not implemented yet.
    """
    # Timestamped base name so repeated exports never collide.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{report_type}_report_{timestamp}"

    temp_dir = tempfile.gettempdir()
    # Fix: the computed `filename` was never used — each entry now derives
    # from it instead of a hard-coded placeholder string.
    formats = {
        'pdf': f"{filename}.pdf",
        'docx': f"{filename}.docx",
        'csv': f"{filename}.csv",
    }

    # Generate files (implementation depends on your PDF/DOCX libraries)
    # Add your preferred reporting libraries here

    return [os.path.join(temp_dir, f) for f in formats.values()]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
 
133
with gr.Blocks() as demo:
    gr.Markdown("# Multi-Document Analysis System")

    with gr.Row():
        # Left column: upload controls plus detected document metadata.
        with gr.Column(scale=1):
            file_input = gr.File(
                label="Upload Documents",
                file_count="multiple",
                file_types=[".txt", ".doc", ".docx"],
                type="filepath"
            )
            process_btn = gr.Button("Analyze Documents", variant="primary")

            # Hidden until a batch has been analyzed.
            with gr.Group(visible=False) as meta_group:
                domain_display = gr.Markdown()
                keywords_display = gr.Markdown()

        # Right column: tabbed results view.
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("Structured Data"):
                    data_table = gr.Dataframe(label="Combined Data Preview", interactive=False)

                with gr.TabItem("Analysis Report"):
                    report_display = gr.Markdown()

            # Hidden until results exist to export.
            with gr.Group(visible=False) as download_group:
                gr.Markdown("### Download Options")
                with gr.Row():
                    pdf_btn = gr.DownloadButton("PDF Report")
                    docx_btn = gr.DownloadButton("Word Report")
                    csv_btn = gr.DownloadButton("CSV Data")

    # One click runs the whole pipeline; output order must match
    # the 7-element list returned by handle_upload.
    process_btn.click(
        fn=handle_upload,
        inputs=file_input,
        outputs=[
            domain_display,
            keywords_display,
            data_table,
            report_display,
            meta_group,
            download_group,
            csv_btn
        ]
    )

    # Connect download buttons (implement actual file generation)
    # pdf_btn.click(fn=lambda: download_report("pdf"), outputs=pdf_btn)
    # docx_btn.click(fn=lambda: download_report("docx"), outputs=docx_btn)
    # csv_btn.click(fn=lambda: download_report("csv"), outputs=csv_btn)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)