Spaces:
Sleeping
Sleeping
import os | |
import sqlite3 | |
from docx import Document | |
# File-reading and SQLite persistence helpers (no tokenizer is initialized in this section)
def read_file(file_path):
    """Read the text content of a Word (.docx) or plain-text (.txt) file.

    Args:
        file_path: Path to the file, as a ``str`` or any ``os.PathLike``
            object. The extension is matched case-insensitively, so
            ``NOTES.TXT`` and ``Report.Docx`` are accepted.

    Returns:
        The file's text. For ``.docx`` files this is the text of every
        paragraph joined with newlines; for ``.txt`` files it is the whole
        file decoded as UTF-8.

    Raises:
        ValueError: If the path does not end in ``.docx`` or ``.txt``.
    """
    # Normalize once so pathlib.Path arguments work and the extension
    # comparison does not depend on letter case.
    path = os.fspath(file_path)
    lowered = path.lower()

    if lowered.endswith('.docx'):
        doc = Document(path)
        return "\n".join(para.text for para in doc.paragraphs)

    if lowered.endswith('.txt'):
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()

    raise ValueError("Unsupported file format. Only .docx and .txt are allowed.")
def save_to_db(chunks, topics=None, db_path='dataset.db'):
    """Save text chunks to the ``documents`` table of a SQLite database.

    The table is created if it does not exist yet. All inserts happen in a
    single transaction: either every chunk is stored, or none are if an
    error occurs.

    Args:
        chunks: Iterable of text chunks; one row is inserted per chunk.
        topics: Value stored in the ``topics`` column of every inserted
            row. It is bound as-is, so it must be a type sqlite3 can bind
            (e.g. ``str`` or ``None``).
        db_path: Path of the SQLite database file. Defaults to
            ``'dataset.db'`` in the current working directory.

    Raises:
        sqlite3.Error: If the database cannot be opened or written.
    """
    conn = sqlite3.connect(db_path)
    try:
        # The connection context manager commits on success and rolls back
        # if any statement raises; it does NOT close the connection.
        with conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    text TEXT,
                    topics TEXT
                )
            ''')
            conn.executemany(
                'INSERT INTO documents (text, topics) VALUES (?, ?)',
                ((chunk, topics) for chunk in chunks),
            )
    finally:
        # Always release the file handle, even when an insert fails.
        conn.close()