"""
This script matches the text annotations created on the original OCR files to the cleaned version of the text.
The annotations were created with Recogito https://recogito.pelagios.org/.
Code by Michela Vignoli. Parts of this code were developed with assistance from GPT-4 and GPT-3 (free version).
"""
## Import packages ##
import numbers
import os
import re
from typing import Union

import pandas as pd
## Import annotations from Recogito ##
# Directory and file name of the Recogito CSV export.
# NOTE(review): "source/path/" looks like a placeholder - adjust to the local data folder.
path_1 = "source/path/"
filename_1 = 'jiggvn0g5pgx34.csv'
# Function to reformat the labels
def reformat_labels(label_str):
    """
    Turns a '|'-separated tag string into a quoted, comma-separated string.

    Example: "place|person" becomes "'place', 'person'".

    Parameters:
    - label_str: The raw tag value with labels separated by '|'. Values that are not
      strings are converted with str() before splitting.

    Returns:
    - str: Every label enclosed in single quotes, joined by ', '. A missing value
      (None or NaN) yields an empty string instead of the literal label 'nan'.
    """
    # NaN is the only float that is not equal to itself; without this guard
    # str(NaN) would be split into the bogus label 'nan'.
    if label_str is None or (isinstance(label_str, float) and label_str != label_str):
        return ""
    labels = str(label_str).split('|')  # Split the string by '|'
    return ', '.join(f"'{label}'" for label in labels)  # Enclose each label in ''
# Load the Recogito export and keep only the columns used by this script
df1 = pd.read_csv(os.path.join(path_1, filename_1))[["UUID", "FILE", "QUOTE_TRANSCRIPTION", "ANCHOR", "COMMENTS", "TAGS"]]
# Apply the function to the 'TAGS' column
df1['TAGS'] = df1['TAGS'].apply(reformat_labels)
## Extract page numbers from merged OCR text file ##
# Read the entire text file into a single string
with open('source/path/Z255430508_clean_merged.txt', 'r', encoding='utf-8') as file1:
    text_content1 = file1.read()
# Function to find a number in the preceding character sequence
# Compiled once: a page marker is the word 'page' immediately followed by digits.
_PAGE_MARKER = re.compile(r'page(\d+)')


def find_number_before_position(text: str, position: int, search_length: int = 10000) -> str:
    """
    Finds the number of the closest 'page<digits>' marker before the given position.

    For position 0 there is no preceding text, so the first marker within the
    following search_length characters is returned instead.

    Parameters:
    - text (str): The full text to search within.
    - position (int): The position in the text to search around.
    - search_length (int): The length of text to search before (or, for position 0,
      after) the position.

    Returns:
    - str: The digits of the last marker found in the preceding text (or of the first
      marker in the following text when position is 0), or "Check!" if no marker
      lies within the search window.

    Raises:
    - ValueError: If text is not a string, or if position or search_length is not a
      non-negative integer.
    """
    if not isinstance(text, str):
        raise ValueError("text must be a string")
    # numbers.Integral accepts plain ints as well as numpy integer scalars.
    if not isinstance(position, numbers.Integral) or position < 0:
        raise ValueError("position must be a non-negative integer")
    if not isinstance(search_length, numbers.Integral) or search_length < 0:
        raise ValueError("search_length must be a non-negative integer")

    position = int(position)
    search_length = int(search_length)

    if position == 0:
        # Nothing precedes the start of the text: look ahead and take the first marker.
        first_marker = _PAGE_MARKER.search(text[:search_length])
        return first_marker.group(1) if first_marker else "Check!"

    # Look back over at most search_length characters and take the nearest (last) marker.
    window_start = max(0, position - search_length)
    page_numbers = _PAGE_MARKER.findall(text[window_start:position])
    return page_numbers[-1] if page_numbers else "Check!"
# Apply the function to each row in the DataFrame.
# An ANCHOR without digits becomes NaN, which also turns the whole column into floats,
# so valid offsets are cast back to int and rows without an offset are flagged with "Check!".
df1['PAGE'] = (
    pd.to_numeric(df1['ANCHOR'].str.extract(r'(\d+)')[0], errors='coerce')
    .apply(lambda x: find_number_before_position(text_content1, int(x)) if pd.notna(x) else "Check!")
)
## Annotation analysis ##
# Split the reformatted labels on ', ' and flatten the per-row lists into a single list.
# explode() flattens in one pass; summing the lists would concatenate them pairwise.
all_labels = df1['TAGS'].str.split(', ').explode().dropna().tolist()
# Count the occurrences of each label
label_counts = pd.Series(all_labels).value_counts()
print(label_counts)