"""Trim the trailing "in the table: ..." clause from Statement headings.

Every ``.html`` file found under the input directory is parsed, its
``<h3>`` heading whose text starts with ``Statement:`` is shortened, and the
result is written to the same relative location under the output directory.
"""

import os
import re
from pathlib import Path

from bs4 import BeautifulSoup

# Matches the text of the heading that carries the statement.
_STATEMENT_HEADING = re.compile(r'^Statement:')

# Matches "in the table:" (plus any whitespace before it) and everything that
# follows; DOTALL lets ".*" run across line breaks inside the heading text.
_TABLE_SUFFIX = re.compile(r'\s*in the table:.*$', flags=re.DOTALL)


def process_html_file(file_path, output_path):
    """Rewrite one HTML file with its Statement heading shortened.

    The file at *file_path* is read as UTF-8 and parsed with the
    ``html.parser`` backend. If an ``<h3>`` whose string starts with
    ``Statement:`` is found, the portion of its text beginning at
    ``in the table:`` is removed. The serialized document is always written
    to *output_path* as UTF-8, whether or not a heading was found.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        markup = source.read()

    document = BeautifulSoup(markup, 'html.parser')

    heading = document.find('h3', string=_STATEMENT_HEADING)
    if heading is not None:
        # heading.string is the heading's text node; swap it for the
        # trimmed text in place.
        text_node = heading.string
        trimmed = _TABLE_SUFFIX.sub('', text_node)
        text_node.replace_with(trimmed)

    with open(output_path, 'w', encoding='utf-8') as destination:
        destination.write(str(document))


def process_directory(input_dir, output_dir):
    """Process every ``.html`` file under *input_dir* into *output_dir*.

    The directory tree is walked with ``os.walk``. Each file whose name ends
    with ``.html`` is passed to :func:`process_html_file`, and its output is
    placed at the same path relative to *output_dir*; missing parent
    directories are created first.
    """
    destination_root = Path(output_dir)
    for root, _dirs, filenames in os.walk(input_dir):
        for filename in filenames:
            if not filename.endswith('.html'):
                continue
            source_path = Path(root) / filename
            target_path = destination_root / source_path.relative_to(input_dir)
            target_path.parent.mkdir(parents=True, exist_ok=True)
            process_html_file(source_path, target_path)


# Input and output directories
input_directory = "htmls_DATER_mod"
output_directory = "htmls_DATER_mod2"

# Run the conversion over the whole input tree
process_directory(input_directory, output_directory)

print("Processing complete. Modified files are in the output directory.")