Spaces:

Danielrahmai1991
/

dataset_interface

Sleeping

App Files Files Community

Danielrahmai1991 committed 21 days ago

Commit

71654c6

verified ·

1 Parent(s): 7261a26

Update preprocessing.py

Browse files

Files changed (1) hide show

preprocessing.py +192 -4

preprocessing.py CHANGED Viewed

@@ -1,17 +1,205 @@
 import os
 import sqlite3
 from docx import Document
-# Initialize tokenizer
 def read_file(file_path):
-    """Read text from Word or Text files."""
     if file_path.endswith('.docx'):
         doc = Document(file_path)
-        return "\n".join([para.text for para in doc.paragraphs])
     elif file_path.endswith('.txt'):
         with open(file_path, 'r', encoding='utf-8') as f:
-            return f.read()
     else:
         raise ValueError("Unsupported file format. Only .docx and .txt are allowed.")

 import os
 import sqlite3
 from docx import Document
+import re
+from hazm import Normalizer
def is_meaningful(text):
    """
    Report whether `text` is free of the ASCII control character '\\x19' (End of Medium).

    The check is a plain substring test; nothing else about the text is inspected.

    Parameters:
    ----------
    text : str
        The text to inspect.

    Returns:
    -------
    int
        - 1: `text` does not contain '\\x19' and is treated as meaningful.
        - 0: `text` contains '\\x19' and is treated as not meaningful.

    Example:
    --------
    >>> is_meaningful("This is a valid sentence.")
    1
    >>> is_meaningful("Invalid text \\x19 with control character.")
    0
    """
    contains_marker = "\x19" in text
    return 0 if contains_marker else 1
# Step 1: Text Cleaning
# URL-like tokens: anything starting with "http" (this also covers "https") or "www",
# up to the next whitespace character.
_URL_PATTERN = re.compile(r"http\S+|www\S+")
# Any character that is not in the Arabic Unicode block (U+0600-U+06FF), not whitespace,
# and not the zero-width non-joiner (U+200C) that Persian uses inside compound words.
_DISALLOWED_CHARS_PATTERN = re.compile(r"[^\u0600-\u06FF\u200c\s]")
_WHITESPACE_RUN_PATTERN = re.compile(r"\s+")


def clean_text(text):
    """
    Cleans the input text, keeping only Persian-script characters and single spaces.

    The steps run in this order:
    1. Remove URL-like tokens (anything starting with `http` or `www` up to the next whitespace).
    2. Remove every character outside the Arabic Unicode block (U+0600-U+06FF), except
       whitespace and the zero-width non-joiner (U+200C), which is kept so that words such
       as 'فاصله‌های' are not fused together. E-mail addresses are not matched explicitly;
       their Latin letters, digits, '@' and '.' are simply dropped by this step.
    3. Collapse every run of whitespace (including newlines) into a single space. This runs
       after step 2 so that gaps left by removed characters do not survive as double spaces.
    4. Strip leading and trailing whitespace.

    Parameters:
    ----------
    text : str
        The input text to be cleaned.

    Returns:
    -------
    str
        The cleaned text. It is empty when the input holds no retained characters.

    Example:
    --------
    >>> clean_text("سلام! این یک متن آزمایشی است. http://example.com")
    'سلام این یک متن آزمایشی است'
    >>> clean_text("  متون   با فاصله های زیاد  ")
    'متون با فاصله های زیاد'
    """
    text = _URL_PATTERN.sub("", text)
    text = _DISALLOWED_CHARS_PATTERN.sub("", text)
    # Collapse last: the two removals above can leave several spaces side by side.
    text = _WHITESPACE_RUN_PATTERN.sub(" ", text)
    return text.strip()
# Step 2: Normalization
def normalize_text(text):
    """
    Normalizes Persian text with a default-configured `hazm` `Normalizer`.

    A new `Normalizer()` is created on every call and its `normalize` method is applied
    to `text`. Which rules run (character standardization, spacing fixes, diacritic
    handling, ...) is decided entirely by hazm's defaults for the installed version;
    this function adds no rules of its own.

    Parameters:
    ----------
    text : str
        The input Persian text to be normalized.

    Returns:
    -------
    str
        Whatever `Normalizer().normalize(text)` returns.
    """
    return Normalizer().normalize(text)
# Full Preprocessing Pipeline
def preprocess_persian_text(text):
    """
    Preprocesses Persian text by cleaning it and then normalizing it.

    The input is first passed to `clean_text` (URL removal, character filtering and
    whitespace clean-up), and that result is passed to `normalize_text` (hazm-based
    normalization). The order matters: normalization only ever sees cleaned text.

    Parameters:
    ----------
    text : str
        The input Persian text to be preprocessed.

    Returns:
    -------
    str
        The cleaned and normalized text.
    """
    return normalize_text(clean_text(text))
 def read_file(file_path):
+    """
+    Reads and preprocesses text from Word (.docx), Text (.txt), or PDF (.pdf) files.
+    This function supports reading Persian text from the following file formats:
+    1. `.docx`: Extracts text from paragraphs in a Word document.
+    2. `.txt`: Reads plain text from a text file encoded in UTF-8.
+    3. `.pdf`: Extracts text from a PDF file using `pypdf`.
+    After extracting the raw text, the function preprocesses it using the `preprocess_persian_text` function,
+    which cleans and normalizes the Persian text.
+    Parameters:
+    ----------
+    file_path : str
+        The path to the input file. Supported formats are `.docx`, `.txt`, and `.pdf`.
+    Returns:
+    -------
+    str
+        The preprocessed Persian text extracted from the file.
+    Raises:
+    ------
+    ValueError
+        - If the file format is unsupported (only `.docx`, `.txt`, and `.pdf` are allowed).
+        - If the extracted text from a PDF file is deemed not meaningful (e.g., contains control characters).
+    Example:
+    --------
+    >>> read_file("example.docx")
+    'سلام دوست عزیز این یک متن آزمایشی است'
+    >>> read_file("example.txt")
+    'این یک فایل متنی ساده است.'
+    >>> read_file("example.pdf")
+    'این متن از یک فایل پی دی اف استخراج شده است.'
+    """
     if file_path.endswith('.docx'):
         doc = Document(file_path)
+        text =  "\n".join([para.text for para in doc.paragraphs])
+        return preprocess_persian_text(text)
     elif file_path.endswith('.txt'):
         with open(file_path, 'r', encoding='utf-8') as f:
+            text =  f.read()
+            return preprocess_persian_text(text)
+    elif file_path.endswith('.pdf'):
+        reader = pypdf.PdfReader(file_path)
+        raw_data = ""
+        for idx in range(len(reader.pages)):
+            raw_data += book_preprocessing(reader.pages[idx].extract_text())
+            if not is_meaningful(raw_data):
+                print("this text not supported")
+                raise ValueError("Unsupported file format.")
+        return preprocess_persian_text(raw_data)
     else:
         raise ValueError("Unsupported file format. Only .docx and .txt are allowed.")