Danielrahmai1991 committed
Commit 71654c6 · verified
1 Parent(s): 7261a26

Update preprocessing.py

Files changed (1)
  1. preprocessing.py +192 -4
preprocessing.py CHANGED
@@ -1,17 +1,205 @@
 import os
 import sqlite3
 from docx import Document
+import re
+import pypdf
+from hazm import Normalizer
 
-# Initialize tokenizer
+def is_meaningful(text):
+    """
+    Determines whether the given text is considered meaningful based on the presence of a specific control character.
+
+    This function checks if the input text contains the ASCII control character '\\x19' (End of Medium).
+    If the character is found, the text is deemed not meaningful and the function returns 0. Otherwise,
+    the text is considered meaningful and the function returns 1.
+
+    Parameters:
+    ----------
+    text : str
+        The input text to be evaluated for meaningfulness.
+
+    Returns:
+    -------
+    int
+        - 0: If the text contains the '\\x19' control character, indicating it is not meaningful.
+        - 1: If the text does not contain the '\\x19' control character, indicating it is meaningful.
+
+    Example:
+    --------
+    >>> is_meaningful("This is a valid sentence.")
+    1
+
+    >>> is_meaningful("Invalid text \\x19 with control character.")
+    0
+    """
+    if "\x19" in text:
+        return 0
+    return 1
+
+
+
+# Step 1: Text Cleaning
+def clean_text(text):
+    """
+    Cleans the input text by removing unwanted patterns and retaining only Persian characters and spaces.
+
+    This function performs the following cleaning steps:
+    1. Removes URLs, emails, and other web-related patterns (e.g., http, https, www).
+    2. Replaces multiple consecutive spaces with a single space.
+    3. Retains only Persian characters (Unicode range \\u0600-\\u06FF) and spaces, removing all other characters.
+    4. Strips leading and trailing whitespace from the resulting text.
+
+    Parameters:
+    ----------
+    text : str
+        The input text to be cleaned.
+
+    Returns:
+    -------
+    str
+        The cleaned text containing only Persian characters and spaces, with unnecessary patterns removed.
+
+    Example:
+    --------
+    >>> clean_text("سلام! این یک متن آزمایشی است. http://example.com و ایمیل: [email protected]")
+    'سلام این یک متن آزمایشی است'
+
+    >>> clean_text(" متون با فاصله های زیاد ")
+    'متون با فاصله های زیاد'
+    """
+    # Remove URLs, emails, and other patterns
+    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
+    text = re.sub(r"\s+", " ", text)  # Replace multiple spaces with a single space
+    text = re.sub(r"[^\u0600-\u06FF\s]", "", text)  # Keep only Persian characters and spaces
+    return text.strip()
+
+
+# Step 2: Normalization
+def normalize_text(text):
+    """
+    Normalizes the input Persian text by standardizing characters and applying common normalization rules.
+
+    This function uses the `Normalizer` class from the `hazm` library to perform the following tasks:
+    1. Standardize Persian characters (e.g., converting Arabic characters to their Persian equivalents).
+    2. Apply common normalization rules such as fixing spacing, removing diacritics, and handling special cases.
+
+    Parameters:
+    ----------
+    text : str
+        The input Persian text to be normalized.
+
+    Returns:
+    -------
+    str
+        The normalized Persian text with standardized characters and consistent formatting.
+
+    Example:
+    --------
+    >>> normalize_text("سلامٔ دوست عزیز، حال شما چطور است؟")
+    'سلام دوست عزیز، حال شما چطور است؟'
+
+    >>> normalize_text("متن با اضافه‌ی فاصله‌های نامنظم.")
+    'متن با اضافه‌ی فاصله‌های نامنظم.'
+    """
+    normalizer = Normalizer()
+    text = normalizer.normalize(text)  # Standardize Persian characters
+    return text
+
+
+# Full Preprocessing Pipeline
+def preprocess_persian_text(text):
+    """
+    Preprocesses Persian text by cleaning and normalizing it.
+
+    This function performs the following steps:
+    1. Cleans the input text using the `clean_text` function:
+       - Removes URLs, emails, and other unwanted patterns.
+       - Replaces multiple spaces with a single space.
+       - Retains only Persian characters and spaces.
+    2. Normalizes the cleaned text using the `normalize_text` function:
+       - Standardizes Persian characters (e.g., converting Arabic characters to their Persian equivalents).
+       - Applies common normalization rules such as fixing spacing and removing diacritics.
+
+    Parameters:
+    ----------
+    text : str
+        The input Persian text to be preprocessed.
+
+    Returns:
+    -------
+    str
+        The preprocessed Persian text, which is cleaned and normalized.
+
+    Example:
+    --------
+    >>> preprocess_persian_text("سلامٔ دوست عزیز! این یک متن آزمایشی است: http://example.com")
+    'سلام دوست عزیز این یک متن آزمایشی است'
+
+    >>> preprocess_persian_text(" متون با فاصله‌های نامنظم و کلمات عربی مثل شیء ")
+    'متون با فاصله‌های نامنظم و کلمات عربی مثل شیء'
+    """
+    text = clean_text(text)
+    text = normalize_text(text)
+    return text
+
+
 
 def read_file(file_path):
-    """Read text from Word or Text files."""
+    """
+    Reads and preprocesses text from Word (.docx), Text (.txt), or PDF (.pdf) files.
+
+    This function supports reading Persian text from the following file formats:
+    1. `.docx`: Extracts text from paragraphs in a Word document.
+    2. `.txt`: Reads plain text from a text file encoded in UTF-8.
+    3. `.pdf`: Extracts text from a PDF file using `pypdf`.
+
+    After extracting the raw text, the function preprocesses it using the `preprocess_persian_text` function,
+    which cleans and normalizes the Persian text.
+
+    Parameters:
+    ----------
+    file_path : str
+        The path to the input file. Supported formats are `.docx`, `.txt`, and `.pdf`.
+
+    Returns:
+    -------
+    str
+        The preprocessed Persian text extracted from the file.
+
+    Raises:
+    ------
+    ValueError
+        - If the file format is unsupported (only `.docx`, `.txt`, and `.pdf` are allowed).
+        - If the extracted text from a PDF file is deemed not meaningful (e.g., contains control characters).
+
+    Example:
+    --------
+    >>> read_file("example.docx")
+    'سلام دوست عزیز این یک متن آزمایشی است'
+
+    >>> read_file("example.txt")
+    'این یک فایل متنی ساده است.'
+
+    >>> read_file("example.pdf")
+    'این متن از یک فایل پی دی اف استخراج شده است.'
+    """
     if file_path.endswith('.docx'):
         doc = Document(file_path)
-        return "\n".join([para.text for para in doc.paragraphs])
+        text = "\n".join([para.text for para in doc.paragraphs])
+        return preprocess_persian_text(text)
     elif file_path.endswith('.txt'):
         with open(file_path, 'r', encoding='utf-8') as f:
-            return f.read()
+            text = f.read()
+            return preprocess_persian_text(text)
+    elif file_path.endswith('.pdf'):
+        reader = pypdf.PdfReader(file_path)
+        raw_data = ""
+        for idx in range(len(reader.pages)):
+            raw_data += reader.pages[idx].extract_text()
+        if not is_meaningful(raw_data):
+            print("This text is not supported.")
+            raise ValueError("Extracted PDF text is not meaningful.")
+        return preprocess_persian_text(raw_data)
+
     else:
         raise ValueError("Unsupported file format. Only .docx and .txt are allowed.")
 
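For reference, a minimal usage sketch of the updated module, assuming the hazm, pypdf, and python-docx packages are installed and preprocessing.py is importable; the file name sample.docx is hypothetical:

from preprocessing import preprocess_persian_text, read_file

# Clean and normalize a raw Persian string directly.
print(preprocess_persian_text("سلام! این یک متن آزمایشی است. http://example.com"))
# Expected, per the docstring examples: 'سلام این یک متن آزمایشی است'

# Extract, clean, and normalize the text of a whole document.
# "sample.docx" is a placeholder path; .txt and .pdf paths are handled the same way.
text = read_file("sample.docx")
print(text)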