"""
This script creates a CSV with all data to be indexed on the Marqo server.

Code by Michela Vignoli. Parts of this code were developed with assistance from GPT-4 and GPT-3 (free version).
"""

import os
import csv
import chardet
from tqdm import tqdm

# Helper function to get all file paths with a specific extension in a folder
def collect_files(folder, extension=".txt"):
    file_paths = []
    for root, _, files in os.walk(folder):
        for file in files:
            if file.endswith(extension):
                file_paths.append(os.path.join(root, file))
    return file_paths
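
# For example (hypothetical layout): collect_files("data/clean/") would return
# paths such as "data/clean/Z166069305/00001.txt" for every .txt file found
# recursively under that folder.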

# Function to process files and extract their text
def process_file(file_path):
    try:
        # Detect encoding (chardet may return None if detection fails,
        # in which case we fall back to UTF-8)
        with open(file_path, 'rb') as f:
            result = chardet.detect(f.read())
            encoding = result['encoding'] or 'utf-8'

        # Read the file
        with open(file_path, 'r', encoding=encoding) as f:
            return f.read()
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Combine data from clean, orig, and prep folders
def combine_data(clean_files, orig_files, prep_files):
    combined_data = []

    # Index files by (barcode, page) for matching: the barcode is taken from
    # the first 10 characters of the parent directory name, the page from the
    # first 5 characters of the file name
    def index_files(files):
        indexed = {}
        for file in files:
            barcode = os.path.basename(os.path.dirname(file))[:10]
            page = os.path.basename(file)[:5]
            indexed[(barcode, page)] = file
        return indexed

    clean_index = index_files(clean_files)
    orig_index = index_files(orig_files)
    prep_index = index_files(prep_files)

    # Process files and combine data; iteration is driven by the clean files,
    # so orig/prep pages without a matching clean page are skipped
    for key in tqdm(clean_index.keys(), desc="Combining data", unit="file"):
        clean_file = clean_index.get(key)
        orig_file = orig_index.get(key)
        prep_file = prep_index.get(key)

        # Extract text
        text_clean = process_file(clean_file) if clean_file else None
        text_orig = process_file(orig_file) if orig_file else None
        text_prep = process_file(prep_file) if prep_file else None

        # Add combined data row
        barcode, page = key
        page_url = page.zfill(8)  # page is already 5 chars; pad to 8 digits for the IIIF URL
        iiif_link = f"https://iiif.onb.ac.at/images/ABO/{barcode}/{page_url}/full/full/0/native.jpg"
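        # For example, barcode "Z166069305" with page file "00001" yields:
        # https://iiif.onb.ac.at/images/ABO/Z166069305/00000001/full/full/0/native.jpg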

        combined_data.append({
            "barcode": barcode,
            "page": page,
            "iiif_link": iiif_link,
            "text_clean": text_clean,
            "text_orig": text_orig,
            "text_prep": text_prep,
        })

    return combined_data

# Lists of folders to process
clean_folders = [
    'source/path/DHd 2025 dataset/Sonnini Z166069305/Z166069305_clean/',
]
orig_folders = [
    'source/path/02-texts/D19/Z166069305',
]
prep_folders = [
    'source/path/DHd 2025 dataset/Sonnini Z166069305/Z166069305_clean_preprocessed/',
]

# Collect file paths
clean_files = [file for folder in clean_folders for file in collect_files(folder)]
orig_files = [file for folder in orig_folders for file in collect_files(folder)]
prep_files = [file for folder in prep_folders for file in collect_files(folder)]

# Combine data from all folders
all_data = combine_data(clean_files, orig_files, prep_files)

# Specify the file path and create the directory if it does not exist
csv_file = 'output/path/DHd_index.csv'
os.makedirs(os.path.dirname(csv_file), exist_ok=True)

# Write data to CSV file
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=["barcode", "page", "iiif_link", "text_clean", "text_orig", "text_prep"])
    writer.writeheader()
    writer.writerows(all_data)

#### IMPORTANT ####
#### Data cleaning is needed after storing the file ####

"""
# Post-hoc cleaning (to be run after the CSV has been written). This assumes
# the CSV has been loaded back into a pandas DataFrame, e.g.:
#   import pandas as pd
#   index_DHd = pd.read_csv(csv_file)

# Specify columns to check and update
columns_to_check = ["text_clean", "text_prep"]

# Check for rows where any of the columns contain "status code" or "empty page"
rows_to_update = index_DHd[columns_to_check].applymap(lambda x: any(keyword in str(x) for keyword in ["status code", "empty page"])).any(axis=1)

# Replace content in the specified columns for the identified rows
index_DHd.loc[rows_to_update, columns_to_check] = "<empty page>"

# Remove artifacts from the LLM generation process. Note that str.strip()
# removes a set of characters rather than a prefix string, so use
# str.removeprefix() instead:
index_DHd['text_prep'] = index_DHd['text_prep'].str.removeprefix("Here is the corrected text:").str.strip()
"""

print(f"Data from all folders has been written to {csv_file}")