File size: 1,065 Bytes
beccb39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import os
import sqlite3
from docx import Document

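# NOTE(review): this comment said "Initialize tokenizer", but no tokenizer is created in this file — confirm whether the setup code was removed.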

def read_file(file_path):
    """Read text from Word or Text files.

    Args:
        file_path: Path to the input file, as a ``str`` or any
            ``os.PathLike`` object (e.g. ``pathlib.Path``). The extension
            is matched case-insensitively, so ``NOTES.TXT`` is accepted.

    Returns:
        The file's text. For ``.docx`` files this is the text of every
        paragraph joined with newlines; for ``.txt`` files it is the raw
        UTF-8 decoded content.

    Raises:
        ValueError: If the path ends with neither ``.docx`` nor ``.txt``.
    """
    # Normalise PathLike objects to a plain string so the suffix test below
    # works for pathlib.Path as well as str.
    path = os.fspath(file_path)
    # Compare on a lowercased copy only; the original path is what gets opened.
    lowered = path.lower()

    if lowered.endswith('.docx'):
        doc = Document(path)
        return "\n".join(para.text for para in doc.paragraphs)
    if lowered.endswith('.txt'):
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()
    raise ValueError("Unsupported file format. Only .docx and .txt are allowed.")

# NOTE(review): a stray bare name `s` stood here and raised NameError at import time; removed.

def save_to_db(chunks, topics=None, db_path='dataset.db'):
    """Save chunks to SQLite database.

    Creates the ``documents`` table if it does not exist, then inserts one
    row per chunk. All inserts happen in a single transaction: either every
    chunk is stored or, if an error occurs, none are.

    Args:
        chunks: Iterable of text values, one per row to insert.
        topics: Value stored in the ``topics`` column of every inserted row.
            It is bound as a single SQL parameter, so it must be a type
            sqlite3 can bind (e.g. ``str`` or ``None``).
        db_path: Path of the SQLite database file. Defaults to
            ``'dataset.db'`` in the current working directory.
    """
    conn = sqlite3.connect(db_path)
    try:
        # The connection context manager commits on success and rolls back
        # if an exception escapes; it does NOT close the connection, hence
        # the surrounding try/finally.
        with conn:
            # Create table if not exists
            conn.execute('''
                CREATE TABLE IF NOT EXISTS documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    text TEXT,
                    topics TEXT
                )
            ''')

            # Insert all chunks in one batched call
            conn.executemany(
                'INSERT INTO documents (text, topics) VALUES (?, ?)',
                ((chunk, topics) for chunk in chunks),
            )
    finally:
        conn.close()