Danielrahmai1991 committed on
Commit
beccb39
·
verified ·
1 Parent(s): 00be21e

Update preprocessing.py

Browse files
Files changed (1) hide show
  1. preprocessing.py +39 -0
preprocessing.py CHANGED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sqlite3
3
+ from docx import Document
4
+
5
+ # Initialize tokenizer
6
+
7
def read_file(file_path):
    """Read text from a Word (.docx) or plain-text (.txt) file.

    Args:
        file_path: Path to the input file, as a ``str`` or ``os.PathLike``.
            The extension is matched case-insensitively, so ``NOTES.TXT``
            and ``Report.Docx`` are accepted.

    Returns:
        The text of the file. For ``.docx`` the text of every paragraph is
        joined with newlines; for ``.txt`` the file is decoded as UTF-8 and
        returned whole.

    Raises:
        ValueError: If the extension is neither ``.docx`` nor ``.txt``.
    """
    # Normalise path-like objects to a plain string so the suffix test and
    # both readers work for pathlib.Path arguments as well.
    path = os.fspath(file_path)
    lowered = path.lower()

    if lowered.endswith('.docx'):
        doc = Document(path)
        return "\n".join(para.text for para in doc.paragraphs)

    if lowered.endswith('.txt'):
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()

    raise ValueError("Unsupported file format. Only .docx and .txt are allowed.")
19
+
20
def save_to_db(chunks, topics=None, db_path='dataset.db'):
    """Save text chunks to the ``documents`` table of a SQLite database.

    The table is created on first use with the columns ``id`` (autoincrement
    primary key), ``text`` and ``topics``. Each chunk becomes one row, and
    every row stores the same ``topics`` value.

    Args:
        chunks: Iterable of text chunks to insert (any iterable, including a
            generator).
        topics: Value stored in the ``topics`` column of every inserted row;
            ``None`` is stored as SQL NULL.
        db_path: Path of the SQLite database file. Defaults to
            ``'dataset.db'`` in the current working directory.

    Raises:
        sqlite3.Error: If the database cannot be opened or a statement fails.
            In that case no rows from this call are kept.
    """
    conn = sqlite3.connect(db_path)
    try:
        # ``with conn`` commits when the body succeeds and rolls back when it
        # raises, so a failing batch never leaves partial rows behind.
        with conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    text TEXT,
                    topics TEXT
                )
            ''')
            conn.executemany(
                'INSERT INTO documents (text, topics) VALUES (?, ?)',
                ((chunk, topics) for chunk in chunks),
            )
    finally:
        # The connection context manager does not close the connection, so
        # release the file handle explicitly on the error path too.
        conn.close()