import gradio as gr
from utils import update_db_hub
from preprocessing import read_file, smart_chunking
import datetime
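
# `utils` and `preprocessing` are local modules expected alongside this file: they
# provide the database/Hub update helper (`update_db_hub`) and the text extraction
# and chunking helpers (`read_file`, `smart_chunking`).
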
def process_file(files, topic):
"""
Processes an uploaded file, extracts its text content, and saves it to the database.
This function performs the following steps:
1. Reads the content of the uploaded file using the `read_file` function.
- Supports `.docx`, `.txt`, and `.pdf` file formats.
2. Splits the extracted text into chunks (if applicable).
3. Saves the processed text and associated topics to the database using the `save_to_db` function.
4. Returns a success message if the file is processed and saved successfully.
If any error occurs during processing, the function catches the exception and returns an error message.
Parameters:
----------
file : object
The uploaded file object. The file's name (`file.name`) is used to determine the file path.
topic : list or str
A list of topics or a single topic string associated with the file. These are saved to the database along with the file content.
Returns:
-------
str
- A success message indicating that the file was processed and saved successfully.
- An error message if an exception occurs during processing.
Example:
--------
>>> process_file(uploaded_file, ["Persian Literature", "History"])
'File processed successfully! File saved to the database.'
>>> process_file(unsupported_file, ["Science"])
'Error processing file: Unsupported file format. Only .docx, .txt, and .pdf are allowed.'
"""
# progress = gr.Progress()
texts = []
topics = []
dates = []
log_history = [] # To store logs for each file
for i, file in enumerate(files):
# progress(i / len(files), desc=f"Processing file {i + 1}/{len(files)}: {file_path}")
try:
# Read the file content
file_path = file.name
text = read_file(file_path)
            chunks = smart_chunking(text)
            for chunk in chunks:
texts.append(chunk)
topics.append(topic)
dates.append(datetime.datetime.now().isoformat())
            log_history.append(f"File {file_path} processed successfully! File saved to the database.")
except Exception as e:
log_history.append( f"Error processing for file {file_path}: {str(e)}")
print("save in db")
update_db_hub(texts, topics, dates)
# print('saved')
# progress(1.0, desc="Processing complete!")
return "\n".join(log_history)
# Define Gradio interface
with gr.Blocks() as demo:
gr.Markdown("# Dataset Upload Interface")
with gr.Row():
        file_input = gr.File(label="Upload Files (.docx, .txt, or .pdf)", file_count="multiple")
        topic_input = gr.Textbox(label="Topics (comma-separated)", placeholder="e.g., science, technology, law, medicine")
submit_button = gr.Button("Upload and Process")
output_text = gr.Textbox(label="Status")
submit_button.click(process_file, inputs=[file_input, topic_input], outputs=output_text)
# Launch the app
demo.launch()