# Spaces: RockMi / onit-text-analysis (Sleeping)
# File size: 3,364 Bytes
# e62e0c5

"""
This script matches the text annotations created on the original OCR files to the cleaned version of the text.
The annotations were created with Recogito https://recogito.pelagios.org/.

Code by Michela Vignoli. Parts of this code were developed with assistance from GPT-4 and GPT-3 (free version).
"""

## Import packages ##

import pandas as pd
import os
import re
from typing import Union


## Import annotations from Recogito ##

# Location of the Recogito CSV export: directory first, then file name
path_1, filename_1 = "source/path/", 'jiggvn0g5pgx34.csv'

# Function to reformat the labels
def reformat_labels(label_str):
    """
    Turns a '|'-separated label string into a ', '-separated list of quoted labels.

    Example: "person|place" becomes "'person', 'place'".

    The input is passed through str() first, so non-string values are quoted as
    their string form (a float NaN becomes "'nan'", None becomes "'None'").

    Parameters:
    - label_str: The raw label value; labels are separated by '|'.

    Returns:
    - str: Every label wrapped in single quotes, joined with ', '.
    """
    # Wrap each '|'-separated piece in single quotes, then join with ', '
    quoted_parts = map("'{}'".format, str(label_str).split('|'))
    return ', '.join(quoted_parts)

# Load the Recogito export and keep only the annotation columns used below
annotations_csv = os.path.join(path_1, filename_1)
df1 = pd.read_csv(annotations_csv)[
    ["UUID", "FILE", "QUOTE_TRANSCRIPTION", "ANCHOR", "COMMENTS", "TAGS"]
]

# Rewrite every value of the 'TAGS' column with reformat_labels
df1['TAGS'] = df1['TAGS'].map(reformat_labels)


## Extract page numbers from merged OCR text file ##

# Load the whole merged text file into one string
with open('source/path/Z255430508_clean_merged.txt', encoding='utf-8') as file1:
    text_content1 = file1.read()


# Function to find a number in the preceding character sequence
def find_number_before_position(text: str, position: int, search_length: int = 10000) -> str:
    """
    Finds the number of the 'page<digits>' marker closest before the given position.

    The search window is the `search_length` characters that precede `position`; the
    last 'page<digits>' occurrence in that window wins. When `position` is 0 there is
    no preceding text, so the window is the first `search_length` characters of the
    text instead and the first occurrence wins.

    Parameters:
    - text (str): The full text to search within.
    - position (int): Non-negative character offset in the text to search around.
    - search_length (int): Maximum number of characters to inspect.

    Returns:
    - str: The digits that follow 'page' in the selected marker, as a string, or the
           placeholder "Check!" if the window contains no such marker.

    Raises:
    - ValueError: If text is not a string, or if position or search_length is not a
                  non-negative integer.
    """
    if not isinstance(text, str):
        raise ValueError("text must be a string")
    if not isinstance(position, int) or position < 0:
        raise ValueError("position must be a non-negative integer")
    if not isinstance(search_length, int) or search_length < 0:
        raise ValueError("search_length must be a non-negative integer")

    if position == 0:
        # Nothing precedes the start of the text: look forward, take the first marker
        window = text[:search_length]
        pick = 0
    else:
        # Look backward and take the marker nearest to the position
        window = text[max(0, position - search_length):position]
        pick = -1

    matches = re.findall(r'page(\d+)', window)
    # "Check!" flags rows that need manual review because no marker was in range
    return matches[pick] if matches else "Check!"

# Look up the page marker for every annotation from the number in its ANCHOR value.
# pd.to_numeric(errors='coerce') turns anchors without digits into NaN, which makes the
# whole Series float64; find_number_before_position only accepts int positions, so the
# offsets are converted back to int and unparseable anchors are flagged with "Check!".
anchor_offsets = pd.to_numeric(df1['ANCHOR'].str.extract(r'(\d+)')[0], errors='coerce')
df1['PAGE'] = [
    find_number_before_position(text_content1, int(offset)) if pd.notna(offset) else "Check!"
    for offset in anchor_offsets
]


## Annotation analysis ##

# Split the reformatted TAGS strings on ', ' and flatten the per-row lists into one list
# (explode avoids repeated list concatenation and yields [] for an empty frame)
all_labels = df1['TAGS'].str.split(', ').explode().tolist()

# Count the occurrences of each label
label_counts = pd.Series(all_labels, dtype=object).value_counts()

print(label_counts)