import os
import sqlite3

try:
    from docx import Document
except ImportError:
    # python-docx is only needed for .docx input; keep the plain-text and
    # SQLite paths usable when it is not installed.
    Document = None


def read_file(file_path):
    """Read text from a Word (.docx) or plain-text (.txt) file.

    Args:
        file_path: Path to the file, as a string or ``os.PathLike``. The
            extension is matched case-insensitively.

    Returns:
        The file's text. For .docx files this is the text of every
        paragraph joined with newlines; for .txt files it is the whole
        file decoded as UTF-8.

    Raises:
        ValueError: If the extension is neither .docx nor .txt.
        ImportError: If a .docx file is requested but python-docx is not
            installed.
    """
    path = os.fspath(file_path)
    suffix = path.lower()

    if suffix.endswith('.docx'):
        if Document is None:
            raise ImportError("python-docx is required to read .docx files.")
        doc = Document(path)
        return "\n".join(para.text for para in doc.paragraphs)

    if suffix.endswith('.txt'):
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()

    raise ValueError("Unsupported file format. Only .docx and .txt are allowed.")


def save_to_db(chunks, topics=None, db_path='dataset.db'):
    """Save text chunks to the ``documents`` table of a SQLite database.

    The table is created if it does not exist. Each chunk becomes one row,
    and every row inserted by a call shares the same ``topics`` value.

    Args:
        chunks: Iterable of text chunks, one row per item.
        topics: Value stored in the ``topics`` column for every row
            (``None`` is stored as NULL). It is bound directly as a SQL
            parameter, so it must be a type sqlite3 can bind (e.g. ``str``).
        db_path: Path of the SQLite database file. Defaults to
            ``'dataset.db'`` in the current working directory.
    """
    conn = sqlite3.connect(db_path)
    try:
        # The connection context manager commits on success and rolls back
        # if any statement raises, so a failed insert leaves no partial rows.
        with conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    text TEXT,
                    topics TEXT
                )
            ''')
            conn.executemany(
                'INSERT INTO documents (text, topics) VALUES (?, ?)',
                ((chunk, topics) for chunk in chunks),
            )
    finally:
        # ``with conn`` only manages the transaction; close explicitly so the
        # file handle is released even when an error occurs.
        conn.close()