File size: 3,302 Bytes
b59ab77
5d3757f
11bb884
5d3757f
b59ab77
033e470
09d42b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
07a4d78
09d42b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69fb57b
09d42b9
876505c
5d3757f
 
 
 
 
69fb57b
095e948
876505c
095e948
033e470
 
 
 
11bb884
 
 
69962b5
11bb884
 
033e470
 
5d3757f
033e470
69fb57b
033e470
69fb57b
095e948
5d3757f
 
11bb884
876505c
69fb57b
b59ab77
 
 
 
 
033e470
07a4d78
b59ab77
 
 
07a4d78
b59ab77
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import gradio as gr
from utils import update_db_hub
from preprocessing import read_file, smart_chunking
import datetime

def process_file(files, topic):
    """
    Read the uploaded files, split their text into chunks, and push the chunks to the database.

    For every uploaded file the function:
    1. Resolves the file path (`file.name` when the object has that attribute,
       otherwise the object itself is used as the path).
    2. Reads the content with `read_file` (supported formats are whatever `read_file` accepts).
    3. Splits the text into chunks with `smart_chunking`.
    4. Queues each chunk together with `topic` and an ISO-8601 timestamp.

    After all files have been handled, the queued chunks are sent to the database
    with a single `update_db_hub(texts, topics, dates)` call. A failure while reading
    or chunking one file is recorded in the returned log and does not stop the
    remaining files; a file that fails contributes no chunks at all.

    Parameters:
    ----------
    files : list or None
        The uploaded files. Each item is either an object exposing a `name`
        attribute that holds the path, or the path itself. `None` or an empty
        list means nothing was uploaded.
    topic : list or str
        Topic information stored unchanged alongside every chunk. No splitting
        or normalisation is performed here.

    Returns:
    -------
    str
        One log line per file (success or error), joined with newlines, or
        "No files were uploaded." when `files` is empty or `None`.

    Notes:
    ------
    Exceptions raised by `update_db_hub` are not caught and propagate to the caller.
    """
    if not files:
        # The upload widget yields no value when the user submits without
        # selecting a file; iterating over that would raise a TypeError.
        return "No files were uploaded."

    texts = []
    topics = []
    dates = []
    log_history = []  # One status line per file
    for file in files:
        # Resolve the path outside the try block so the error message below
        # can always name the file it refers to.
        file_path = getattr(file, "name", file)
        try:
            text = read_file(file_path)
            # Materialise the chunks first: if chunking fails part-way, nothing
            # from this file has been queued yet.
            chunks = list(smart_chunking(text))
        except Exception as e:
            # Per-file boundary: report the failure and continue with the next file.
            log_history.append( f"Error processing for file {file_path}: {str(e)}")
            continue

        texts.extend(chunks)
        topics.extend(topic for _ in chunks)
        dates.extend(datetime.datetime.now().isoformat() for _ in chunks)
        log_history.append( f"File {file_path} processed successfully! file saved to the database.")

    if texts:
        # Skip the database round trip when no chunk was produced at all.
        print("save in db")
        update_db_hub(texts, topics, dates)
    return "\n".join(log_history)

# Define Gradio interface
# Layout: a title, one row holding the file picker and the topic box,
# then the submit button and a read-only status box for the per-file log.
with gr.Blocks() as demo:
    gr.Markdown("# Dataset Upload Interface")
    with gr.Row():
        # file_count="multiple" lets the user select several files in one upload;
        # process_file iterates over them.
        file_input = gr.File(label="Upload File (.docx or .txt or .pdf)",  file_count="multiple")
        # The textbox content is passed to process_file as a single string.
        topic_input = gr.Textbox(label="Topics (comma-separated)", placeholder="e.g., science, technology, law, medicine")
    submit_button = gr.Button("Upload and Process")
    output_text = gr.Textbox(label="Status")

    # Clicking the button runs process_file(files, topic) and shows its
    # returned log string in the status box.
    submit_button.click(process_file, inputs=[file_input, topic_input], outputs=output_text)

# Launch the app
demo.launch()