dataset_interface / preprocessing.py
Danielrahmai1991's picture
Update preprocessing.py
beccb39 verified
raw
history blame
1.07 kB
import os
import sqlite3
from docx import Document
# Helpers for reading source documents and persisting text chunks to SQLite.
def read_file(file_path):
    """Read the full text of a Word (.docx) or plain-text (.txt) file.

    Args:
        file_path: Path to the file, as a ``str`` or any ``os.PathLike``
            (for example ``pathlib.Path``). The extension is matched
            case-insensitively, so ``REPORT.DOCX`` and ``notes.TXT`` work.

    Returns:
        The document text as a single string. For ``.docx`` files the text
        of each paragraph is joined with newlines; for ``.txt`` files the
        UTF-8 decoded content is returned unchanged.

    Raises:
        ValueError: If the extension is neither ``.docx`` nor ``.txt``.
    """
    path = os.fspath(file_path)
    # Lower-case only the copy used for the extension check; the file is
    # still opened with the caller's original spelling of the path.
    lowered = path.lower()

    if lowered.endswith('.docx'):
        doc = Document(path)
        return "\n".join(para.text for para in doc.paragraphs)

    if lowered.endswith('.txt'):
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()

    raise ValueError("Unsupported file format. Only .docx and .txt are allowed.")
def save_to_db(chunks, topics=None, *, db_path='dataset.db'):
    """Save text chunks to the ``documents`` table of a SQLite database.

    The table is created on first use. All chunks are inserted in a single
    transaction: either every row is committed, or (if an insert raises)
    none are.

    Args:
        chunks: Iterable of text chunks; one row is inserted per chunk.
        topics: Value stored in the ``topics`` column of every inserted
            row. It is bound directly as a SQL parameter, so it must be a
            type sqlite3 can store (e.g. ``str`` or ``None``).
        db_path: Path of the SQLite database file. Defaults to
            ``'dataset.db'`` in the current working directory.

    Raises:
        sqlite3.Error: If the database cannot be opened or a statement
            fails. The connection is closed in either case.
    """
    conn = sqlite3.connect(db_path)
    try:
        # ``with conn`` commits on success and rolls back on an exception;
        # it does not close the connection, hence the outer try/finally.
        with conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    text TEXT,
                    topics TEXT
                )
            ''')
            conn.executemany(
                'INSERT INTO documents (text, topics) VALUES (?, ?)',
                ((chunk, topics) for chunk in chunks),
            )
    finally:
        conn.close()