|
import os |
|
import sqlite3 |
|
from docx import Document |
|
|
|
|
|
|
|
def read_file(file_path):
    """Read the text content of a Word (.docx) or plain-text (.txt) file.

    The extension check is case-insensitive, and ``file_path`` may be a
    string or any ``os.PathLike`` object (for example ``pathlib.Path``).

    Args:
        file_path: Path of the file to read.

    Returns:
        The text of the file as a single string. For ``.docx`` files this
        is the text of each entry in ``doc.paragraphs`` joined with
        newlines; for ``.txt`` files it is the whole file decoded as UTF-8.

    Raises:
        ValueError: If the path ends in neither ``.docx`` nor ``.txt``.
    """
    path = os.fspath(file_path)
    # Compare against a lowered copy so "REPORT.DOCX" or "notes.TXT" are
    # accepted, but always open the file under its original name.
    lowered = path.lower()

    if lowered.endswith('.docx'):
        doc = Document(path)
        return "\n".join(para.text for para in doc.paragraphs)

    if lowered.endswith('.txt'):
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()

    raise ValueError("Unsupported file format. Only .docx and .txt are allowed.")
|
|
|
s |
|
|
|
def save_to_db(chunks, topics=None, db_path='dataset.db'):
    """Insert text chunks as rows of the ``documents`` table in SQLite.

    The ``documents`` table is created if it does not exist yet. All
    inserts happen in one transaction: either every chunk is stored, or,
    if an insert fails, none of them are.

    Args:
        chunks: Iterable of text values; one row is inserted per item.
        topics: Value stored in the ``topics`` column of every inserted
            row. Defaults to ``None`` (stored as NULL).
        db_path: Path of the SQLite database file. Defaults to
            ``'dataset.db'`` in the current working directory.

    Raises:
        sqlite3.Error: If the database cannot be opened or a statement
            fails (for example when a value cannot be bound).
    """
    conn = sqlite3.connect(db_path)
    try:
        # Used as a context manager, the connection commits on success and
        # rolls back on an exception; it does NOT close itself, hence the
        # surrounding try/finally.
        with conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    text TEXT,
                    topics TEXT
                )
            ''')
            conn.executemany(
                'INSERT INTO documents (text, topics) VALUES (?, ?)',
                ((chunk, topics) for chunk in chunks),
            )
    finally:
        # Always release the file handle, even when an insert failed.
        conn.close()