import pandas as pd
import argparse
import glob
import os
import re
from tools.helper_functions import output_folder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import numpy as np
import random
import string
from typing import List

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

similarity_threshold = 0.9

stop_words = set(stopwords.words('english'))
# List of words to remove from the stopword set (kept commented out; uncomment to
# preserve negations such as 'not' during cleaning)
#words_to_remove = ['no', 'nor', 'not', 'don', "don't", 'wasn', "wasn't", 'weren', "weren't"]

# Remove the specified words from the stopwords set
#for word in words_to_remove:
#    stop_words.discard(word.lower())

stemmer = PorterStemmer()
vectorizer = TfidfVectorizer()

def combine_ocr_output_text(input_files):
    """
    Combines text from multiple CSV files containing page and text columns.
    Groups text by file and page number, concatenating text within these groups.
    
    Args:
        input_files (list): List of paths to CSV files
    
    Returns:
        pd.DataFrame: Combined dataframe with columns [file, page, text]
    """
    all_data = []
    output_files = []

    if isinstance(input_files, str):
        file_paths_list = [input_files]
    else:
        file_paths_list = input_files
    
    for file in file_paths_list:

        if isinstance(file, str):
            file_path = file
        else:
            file_path = file.name

        # Read CSV file
        df = pd.read_csv(file_path)
        
        # Ensure required columns exist
        if 'page' not in df.columns or 'text' not in df.columns:
            print(f"Warning: Skipping {file_path} - missing required columns 'page' and 'text'")
            continue

        df['text'] = df['text'].fillna('').astype(str)
        
        # Group by page and concatenate text
        grouped = df.groupby('page')['text'].apply(' '.join).reset_index()
        
        # Add filename column
        grouped['file'] = os.path.basename(file_path)
        
        all_data.append(grouped)
    
    if not all_data:
        raise ValueError("No valid CSV files were processed")
    
    # Combine all dataframes
    combined_df = pd.concat(all_data, ignore_index=True)
    
    # Reorder columns
    combined_df = combined_df[['file', 'page', 'text']]

    output_combined_file_path = output_folder + "combined_ocr_output_files.csv"
    combined_df.to_csv(output_combined_file_path, index=False)

    output_files.append(output_combined_file_path)
    
    return combined_df, output_files
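
# Example usage (illustrative; the file names below are hypothetical OCR output
# CSVs containing 'page' and 'text' columns):
#
#   combined_df, combined_paths = combine_ocr_output_text(
#       ["output/doc1_ocr_output.csv", "output/doc2_ocr_output.csv"]
#   )
#   print(combined_df.head())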

def process_data(df, column:str):
    '''
    Clean and stem the text in the given column of a dataframe, storing the
    result in a new 'text_clean' column.
    '''
    
    def _clean_text(raw_text):
        # Remove HTML tags
        clean = re.sub(r'<.*?>', '', raw_text)
        clean = re.sub(r'&nbsp;', ' ', clean)
        clean = re.sub(r'\r\n', ' ', clean)
        clean = re.sub(r'&lt;', ' ', clean)
        clean = re.sub(r'&gt;', ' ', clean)
        clean = re.sub(r'<strong>', ' ', clean)
        clean = re.sub(r'</strong>', ' ', clean)

        # Replace non-breaking space \xa0 with a space
        clean = clean.replace(u'\xa0', u' ')
        # Remove extra whitespace
        clean = ' '.join(clean.split())

        # Tokenize the text
        words = word_tokenize(clean.lower())

        # Remove punctuation and numbers
        words = [word for word in words if word.isalpha()]

        # Remove stopwords
        words = [word for word in words if word not in stop_words]

        # Join the cleaned words back into a string
        return ' '.join(words)

    # Function to apply stemming
    def _apply_stemming(text):
        # Tokenize the text
        words = word_tokenize(text.lower())
        
        # Apply stemming to each word
        stemmed_words = [stemmer.stem(word) for word in words]
        
        # Join the stemmed words back into a single string
        return ' '.join(stemmed_words)

    df['text_clean'] = df[column].apply(_clean_text)
    df['text_clean'] = df['text_clean'].apply(_apply_stemming)
    
    return df
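
# Example (illustrative): cleaning and stemming a small, hypothetical dataframe.
# The cleaning step drops tags, numbers, punctuation and stopwords, and the
# stemming step reduces the remaining words to their stems.
#
#   sample_df = pd.DataFrame({"page": [1], "text": ["<strong>The 3 quick foxes&nbsp;run!</strong>"]})
#   sample_df = process_data(sample_df, "text")
#   # sample_df["text_clean"] -> "quick fox run"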

def identify_similar_pages(input_files:List[str]):
    '''
    Identify pairs of pages whose cleaned, stemmed text has a TF-IDF cosine
    similarity above similarity_threshold, and write the results to CSV files.
    '''

    output_paths = []

    df, output_files = combine_ocr_output_text(input_files)

    output_paths.extend(output_files)

    # Clean text
    df = process_data(df, 'text')

    # Vectorise text
    tfidf_matrix = vectorizer.fit_transform(df['text_clean'])

    # Calculate cosine similarity
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Find the indices of the most similar pages
    np.fill_diagonal(similarity_matrix, 0)  # Ignore self-comparisons
    similar_pages = np.argwhere(similarity_matrix > similarity_threshold)  # Threshold of similarity

    # Create a DataFrame of similar page pairs and their scores
    similarity_df = pd.DataFrame({
        'Page1_Index': similar_pages[:, 0],
        'Page2_Index': similar_pages[:, 1],
        'Similarity_Score': similarity_matrix[similar_pages[:, 0], similar_pages[:, 1]]
    })

    # Filter out duplicate pairs (keep only one direction)
    similarity_df = similarity_df[similarity_df['Page1_Index'] < similarity_df['Page2_Index']]

    # Map the row indices to their corresponding file, page and text
    similarity_df['Page1_File'] = similarity_df['Page1_Index'].map(df['file'])
    similarity_df['Page2_File'] = similarity_df['Page2_Index'].map(df['file'])

    similarity_df['Page1_Page'] = similarity_df['Page1_Index'].map(df['page'])
    similarity_df['Page2_Page'] = similarity_df['Page2_Index'].map(df['page'])

    similarity_df['Page1_Text'] = similarity_df['Page1_Index'].map(df['text'])
    similarity_df['Page2_Text'] = similarity_df['Page2_Index'].map(df['text'])

    similarity_df_out = similarity_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
    similarity_df_out = similarity_df_out.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page", "Similarity_Score"], ascending=[True, True, True, True, False])

    # Save detailed results to a CSV file
    similarity_file_output_path = output_folder + 'page_similarity_results.csv'
    similarity_df_out.to_csv(similarity_file_output_path, index=False)

    output_paths.append(similarity_file_output_path)

    if not similarity_df_out.empty:
        unique_files = similarity_df_out['Page2_File'].unique()
        for redact_file in unique_files:
            output_file_name = output_folder + redact_file + "_whole_page.csv"
            whole_pages_to_redact_df = similarity_df_out.loc[similarity_df_out['Page2_File'] == redact_file, ['Page2_Page']]
            whole_pages_to_redact_df.to_csv(output_file_name, header=False, index=False)

            output_paths.append(output_file_name)

    return similarity_df_out, output_paths
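
# Example usage (illustrative; the input paths are hypothetical OCR output CSVs):
#
#   similar_df, output_paths = identify_similar_pages(
#       ["output/doc1_ocr_output.csv", "output/doc2_ocr_output.csv"]
#   )
#   # similar_df lists page pairs whose TF-IDF cosine similarity exceeds
#   # similarity_threshold (0.9), along with their file names, pages and text.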

# Perturb text
# Apply the perturbation function with a 10% error probability
def perturb_text_with_errors(series):

    def _perturb_text(text, error_probability=0.1):
        words = text.split()  # Split text into words
        perturbed_words = []
        
        for word in words:
            if random.random() < error_probability:  # Add a random error
                perturbation_type = random.choice(['char_error', 'extra_space', 'extra_punctuation'])
                
                if perturbation_type == 'char_error':  # Introduce a character error
                    idx = random.randint(0, len(word) - 1)
                    char = random.choice(string.ascii_lowercase)  # Add a random letter
                    word = word[:idx] + char + word[idx:]
                
                elif perturbation_type == 'extra_space':  # Add extra space around a word
                    word = ' ' + word + ' '
                
                elif perturbation_type == 'extra_punctuation':  # Add punctuation to the word
                    punctuation = random.choice(string.punctuation)
                    idx = random.randint(0, len(word))  # Insert punctuation randomly
                    word = word[:idx] + punctuation + word[idx:]
            
            perturbed_words.append(word)
        
        return ' '.join(perturbed_words)

    series = series.apply(lambda x: _perturb_text(x, error_probability=0.1))

    return series
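
# Example (illustrative): injecting roughly 10% word-level noise into a text
# column of a hypothetical dataframe, e.g. to check how robust the page
# similarity matching is to OCR-style errors.
#
#   noisy_text = perturb_text_with_errors(df["text"])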

# Run through command line
# def main():
#     parser = argparse.ArgumentParser(description='Combine text from multiple CSV files by page')
#     parser.add_argument('input_pattern', help='Input file pattern (e.g., "input/*.csv")')
#     parser.add_argument('--output', '-o', default='combined_text.csv', 
#                        help='Output CSV file path (default: combined_text.csv)')

#     args = parser.parse_args()
    
#     # Get list of input files
#     input_files = glob.glob(args.input_pattern)
    
#     if not input_files:
#         print(f"No files found matching pattern: {args.input_pattern}")
#         return
    
#     print(f"Processing {len(input_files)} files...")
    
#     try:
#         # Combine the text from all files (the function also returns output file paths)
#         combined_df, _ = combine_ocr_output_text(input_files)
        
#         # Save to CSV
#         combined_df.to_csv(args.output, index=False)
#         print(f"Successfully created combined output: {args.output}")
#         print(f"Total pages processed: {len(combined_df)}")
        
#     except Exception as e:
#         print(f"Error processing files: {str(e)}")

# if __name__ == "__main__":
#     main()