"""Clean OCR text files so every book gets the same pre-processing.

For every book a new document is created, so the original file is always
available for cross-checking etc.

Code adapted from the Travelogues project, by Jan Rörden.
Source: https://github.com/travelogues/scripts/blob/master/groundtruth/
"""

import os
import re
import string
import unicodedata

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic; fall back to plain iteration
    # so the script still runs where tqdm is not installed.
    def tqdm(iterable, *args, **kwargs):
        return iterable

# directories
books_original_dir = 'source/path/'
output_dir = 'output/path/'

# Patterns are compiled once here instead of being rebuilt for every line.
_LONG_S_RE = re.compile(r'[ſß]')
_DISALLOWED_RE = re.compile(
    r'[^a-zA-Z0-9\s' + re.escape(string.punctuation) + ']'
)
_LETTER_RE = re.compile(r'[a-zA-Z]')


def remove_accents(input_str):
    """Return *input_str* with accents and umlauts removed.

    The text is NFKD-normalized so accented characters decompose into a
    base character plus combining marks; the combining marks are dropped.
    """
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return ''.join(c for c in nfkd_form if not unicodedata.combining(c))


def clean_line(line):
    """Return one raw OCR line in its normalized form.

    Long s and ß are replaced with a plain ``s``, accents are removed,
    every character that is not an ASCII letter, digit, whitespace or
    ASCII punctuation is dropped, and trailing whitespace (including the
    line break) is stripped. Case is preserved.
    """
    cleaned = _LONG_S_RE.sub('s', line)
    cleaned = remove_accents(cleaned)
    cleaned = _DISALLOWED_RE.sub('', cleaned)
    return cleaned.rstrip()


def _is_noise(text):
    """Tell whether a cleaned, non-empty line should be discarded.

    A line is noise when it is shorter than three characters, consists
    only of digits, or contains no ASCII letter at all.
    """
    return (
        len(text) < 3
        or text.isdigit()
        or _LETTER_RE.search(text) is None
    )


def _page_output(page_lines):
    """Return the output lines for one finished page.

    A page with no kept lines, or whose first kept line starts with
    ``statuscode``, is emitted as a single empty line; any other page is
    emitted unchanged.
    """
    # NOTE(review): the original condition also tested
    # page_lines[0].startswith(''), which is true for every string and
    # therefore blanked every page. It is dropped here; if a real marker
    # character was lost from that literal, add it back alongside
    # 'statuscode'.
    if not page_lines or page_lines[0].startswith('statuscode'):
        return [""]
    return page_lines


def clean_pages(lines):
    """Clean an iterable of raw OCR lines and return the kept lines.

    A line that is empty after cleaning marks a page break. The break is
    detected *before* the noise filter runs: blank lines are shorter than
    three characters, so filtering first would discard them and no page
    boundary would ever be seen.
    """
    cleaned_lines = []
    page_lines = []
    for line in lines:
        text = clean_line(line)
        if text == "":
            # Page break: flush the page collected so far.
            cleaned_lines.extend(_page_output(page_lines))
            page_lines = []
        elif not _is_noise(text):
            page_lines.append(text)

    # Flush the last page, which is not followed by a blank line.
    cleaned_lines.extend(_page_output(page_lines))
    return cleaned_lines


def main():
    """Clean every ``.txt`` book in ``books_original_dir``.

    Each result is written to ``output_dir`` as ``<book id>_cleaned.txt``;
    the original files are left untouched.
    """
    # Ensure the cleaned directory exists
    os.makedirs(output_dir, exist_ok=True)

    for fname in tqdm(sorted(os.listdir(books_original_dir))):
        # Process only .txt files
        if not fname.endswith('.txt'):
            continue

        # The file name without its ".txt" suffix names the output file.
        current_book_id = fname[:-4]

        source_path = os.path.join(books_original_dir, fname)
        with open(source_path, 'r', encoding='utf-8') as f:
            cleaned_lines = clean_pages(f)

        # Save the cleaned text to a new file, one kept line per row.
        cleaned_file_path = os.path.join(
            output_dir, f"{current_book_id}_cleaned.txt"
        )
        with open(cleaned_file_path, 'w', encoding='utf-8') as cleaned_file:
            cleaned_file.write('\n'.join(cleaned_lines))


if __name__ == "__main__":
    main()