Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from utils import update_db_hub | |
| from preprocessing import read_file, smart_chunking | |
| import datetime | |
def process_file(files, topic):
    """
    Read uploaded files, split them into chunks, and send the chunks to the database.

    For every uploaded file the function:

    1. Reads the content with `read_file` (the supported formats are whatever
       `read_file` accepts).
    2. Splits the extracted text into chunks with `smart_chunking`.
    3. Queues each chunk together with `topic` and the current timestamp in
       ISO-8601 format.

    Once all files have been visited, the queued chunks are written with a single
    `update_db_hub(texts, topics, dates)` call. A failure while reading or chunking
    one file is recorded in the returned log and does not stop the remaining
    files; none of that file's chunks are queued.

    Parameters:
    ----------
    files : list, object or None
        The uploaded files. Each entry is either an object exposing its path as
        `.name` or the path itself. A single entry that is not wrapped in a
        list is accepted as well.
    topic : list or str
        The topic(s) associated with the upload. The value is stored unchanged
        next to every chunk.

    Returns:
    -------
    str
        One log line per file (success or error), joined with newlines, or a
        short notice when no file was uploaded.

    Raises:
    ------
    Exception
        Anything raised by `update_db_hub` propagates to the caller.
    """
    # Submitting the form without an upload passes None; there is nothing to do.
    if not files:
        return "No files were uploaded."
    if not isinstance(files, (list, tuple)):
        files = [files]

    texts = []
    topics = []
    dates = []
    log_history = []  # One entry per file, in upload order

    for file in files:
        # Resolved before the try block so the error message can always name
        # the entry that failed, even when the entry itself is malformed.
        file_path = getattr(file, "name", file)
        try:
            # Materialize the chunks first: if chunking fails midway, nothing
            # from this file has been queued yet.
            chunks = list(smart_chunking(read_file(file_path)))
        except Exception as e:
            log_history.append(f"Error processing for file {file_path}: {str(e)}")
            continue

        for chunk in chunks:
            texts.append(chunk)
            topics.append(topic)
            dates.append(datetime.datetime.now().isoformat())
        log_history.append(f"File {file_path} processed successfully! file saved to the database.")

    # Skip the database round-trip when every file failed or produced no chunks.
    if texts:
        print("save in db")
        update_db_hub(texts, topics, dates)

    return "\n".join(log_history)
# Build the upload UI: a title, the file/topic inputs, a submit button and a
# status box that shows the log returned by process_file.
with gr.Blocks() as demo:
    gr.Markdown("# Dataset Upload Interface")

    # NOTE(review): assumes only the two inputs share the row and the button
    # and status box sit below it - confirm against the intended layout.
    with gr.Row():
        file_input = gr.File(
            label="Upload File (.docx or .txt or .pdf)",
            file_count="multiple",
        )
        topic_input = gr.Textbox(
            label="Topics (comma-separated)",
            placeholder="e.g., science, technology, law, medicin",
        )

    submit_button = gr.Button("Upload and Process")
    output_text = gr.Textbox(label="Status")

    # Clicking the button runs process_file on the uploaded files and the
    # topic text, then writes the returned log into the status box.
    submit_button.click(
        fn=process_file,
        inputs=[file_input, topic_input],
        outputs=output_text,
    )

# Start the web server for the interface.
demo.launch()