Tabular-LLM-Study-Debugging / modify_html_dater.py
luulinh90s's picture
update
8251e4d
raw
history blame
1.49 kB
import os
import re
from pathlib import Path
from bs4 import BeautifulSoup
def process_html_file(file_path, output_path):
    """Copy an HTML file to *output_path* with its "Statement:" heading trimmed.

    The first ``<h3>`` whose text starts with ``Statement:`` has the phrase
    "in the table:" (plus any whitespace right before it) and everything
    following it removed. Documents without such a heading are still
    re-serialised and written out.

    Args:
        file_path: Path of the HTML file to read (decoded as UTF-8).
        output_path: Path the resulting HTML is written to (encoded as UTF-8).
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        markup = source.read()
    soup = BeautifulSoup(markup, 'html.parser')

    # Matching on ``string=`` only selects headings that consist of a single
    # text node, so ``heading.string`` is always usable below.
    heading = soup.find('h3', string=re.compile(r'^Statement:'))
    if heading is not None:
        # DOTALL lets ".*" run across line breaks, so the cut extends to the
        # very end of the heading text.
        trimmed = re.sub(
            r'\s*in the table:.*$',
            '',
            heading.string,
            flags=re.DOTALL,
        )
        heading.string.replace_with(trimmed)

    # Always emit the document, whether or not a heading was rewritten.
    with open(output_path, 'w', encoding='utf-8') as sink:
        sink.write(str(soup))
def process_directory(input_dir, output_dir):
    """Run ``process_html_file`` on every ``.html`` file under *input_dir*.

    The directory tree is walked recursively. Each result is written beneath
    *output_dir* at the same relative location the source file has beneath
    *input_dir*; missing destination folders are created on demand. Files
    whose name does not end in ``.html`` are ignored.

    Args:
        input_dir: Root directory to scan for HTML files.
        output_dir: Root directory that receives the processed copies.
    """
    for current_dir, _subdirs, filenames in os.walk(input_dir):
        for filename in filenames:
            if not filename.endswith('.html'):
                continue

            source = Path(current_dir).joinpath(filename)
            # Rebuild the source's position relative to the input root under
            # the output root so the folder layout is mirrored.
            target = Path(output_dir).joinpath(source.relative_to(input_dir))
            target.parent.mkdir(parents=True, exist_ok=True)
            process_html_file(source, target)
# Define input and output directories
input_directory = "htmls_DATER_mod"
output_directory = "htmls_DATER_mod2"

# Process the files only when this file is executed as a script, so that
# importing the module (e.g. to reuse process_html_file) does not walk the
# input directory or write any files as a side effect.
if __name__ == "__main__":
    process_directory(input_directory, output_directory)
    print("Processing complete. Modified files are in the output directory.")