import gradio as gr
from utils import update_db_hub
from preprocessing import read_file, smart_chunking
import datetime
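
# `utils` and `preprocessing` are local modules expected alongside this file: they
# provide the database/Hub update helper (`update_db_hub`) and the text extraction
# and chunking helpers (`read_file`, `smart_chunking`).
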
def process_file(files, topic):
"""
Processes an uploaded file, extracts its text content, and saves it to the database.
This function performs the following steps:
1. Reads the content of the uploaded file using the `read_file` function.
- Supports `.docx`, `.txt`, and `.pdf` file formats.
2. Splits the extracted text into chunks (if applicable).
3. Saves the processed text and associated topics to the database using the `save_to_db` function.
4. Returns a success message if the file is processed and saved successfully.
If any error occurs during processing, the function catches the exception and returns an error message.
Parameters:
----------
file : object
The uploaded file object. The file's name (`file.name`) is used to determine the file path.
topic : list or str
A list of topics or a single topic string associated with the file. These are saved to the database along with the file content.
Returns:
-------
str
- A success message indicating that the file was processed and saved successfully.
- An error message if an exception occurs during processing.
Example:
--------
>>> process_file(uploaded_file, ["Persian Literature", "History"])
'File processed successfully! File saved to the database.'
>>> process_file(unsupported_file, ["Science"])
'Error processing file: Unsupported file format. Only .docx, .txt, and .pdf are allowed.'
"""
# progress = gr.Progress()
texts = []
topics = []
dates = []
log_history = [] # To store logs for each file
for i, file in enumerate(files):
# progress(i / len(files), desc=f"Processing file {i + 1}/{len(files)}: {file_path}")
try:
# Read the file content
file_path = file.name
text = read_file(file_path)
            chunks = smart_chunking(text)
            for chunk in chunks:
texts.append(chunk)
topics.append(topic)
dates.append(datetime.datetime.now().isoformat())
            log_history.append(f"File {file_path} processed successfully! File saved to the database.")
except Exception as e:
log_history.append( f"Error processing for file {file_path}: {str(e)}")
print("save in db")
update_db_hub(texts, topics, dates)
# print('saved')
# progress(1.0, desc="Processing complete!")
return "\n".join(log_history)
# Define Gradio interface
with gr.Blocks() as demo:
gr.Markdown("# Dataset Upload Interface")
with gr.Row():
        file_input = gr.File(label="Upload Files (.docx, .txt, or .pdf)", file_count="multiple")
        topic_input = gr.Textbox(label="Topics (comma-separated)", placeholder="e.g., science, technology, law, medicine")
submit_button = gr.Button("Upload and Process")
output_text = gr.Textbox(label="Status")
submit_button.click(process_file, inputs=[file_input, topic_input], outputs=output_text)
# Launch the app
demo.launch()