|
import os |
|
import re |
|
from pathlib import Path |
|
from bs4 import BeautifulSoup |
|
|
|
|
|
def process_html_file(file_path, output_path):
    """Write a copy of an HTML file with its "Statement:" heading trimmed.

    The input is parsed with BeautifulSoup's ``html.parser``. In the first
    ``<h3>`` whose string starts with ``Statement:``, everything from
    ``in the table:`` (including whitespace directly before it) to the end
    of the heading text is removed. When no such heading exists, the document
    is still re-serialized and written to ``output_path``.

    Args:
        file_path: Location of the HTML file to read (decoded as UTF-8).
        output_path: Location the resulting HTML is written to (UTF-8).
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        soup = BeautifulSoup(source.read(), 'html.parser')

    heading = soup.find('h3', string=re.compile(r'^Statement:'))
    if heading is not None:
        # Work on the heading's text node so the <h3> tag itself is untouched.
        text_node = heading.string
        # DOTALL lets ".*" run across newlines, dropping the whole tail.
        trimmed = re.sub(r'\s*in the table:.*$', '', text_node, flags=re.DOTALL)
        text_node.replace_with(trimmed)

    with open(output_path, 'w', encoding='utf-8') as target:
        target.write(str(soup))
|
|
|
|
|
def process_directory(input_dir, output_dir):
    """Process every ``.html`` file under ``input_dir`` into ``output_dir``.

    The input tree is traversed with ``os.walk``. Each file whose name ends in
    ``.html`` is passed to ``process_html_file`` and written to the same
    relative location beneath ``output_dir``; missing destination folders are
    created on demand. Files with other names are ignored.

    Args:
        input_dir: Root directory to scan for HTML files.
        output_dir: Root directory that receives the processed files.
    """
    for current_dir, _subdirs, filenames in os.walk(input_dir):
        html_names = (name for name in filenames if name.endswith('.html'))
        for name in html_names:
            source = Path(current_dir, name)
            # Keep the position of the file relative to the input root.
            relative = source.relative_to(input_dir)
            target = Path(output_dir).joinpath(relative)

            # The destination folder may not exist yet.
            target.parent.mkdir(parents=True, exist_ok=True)
            process_html_file(source, target)
|
|
|
|
|
|
|
# Root of the tree holding the HTML files to rewrite.
input_directory = "htmls_DATER_mod"
# Root of the tree that receives the rewritten copies.
output_directory = "htmls_DATER_mod2"

# Runs at module level, so the conversion starts as soon as this file executes.
process_directory(input_dir=input_directory, output_dir=output_directory)

print("Processing complete. Modified files are in the output directory.")