# Hugging Face Space: Tzktz/Dit-document-layout-analysis (status: Running)
# File size: 1,732 Bytes
# 6fc683c

import json  
import hashlib  
import io  
import os  
import base64  
from PIL import Image  
from tqdm import tqdm

def calculate_md5(image):
    """Return the MD5 hex digest of *image* serialised as JPEG.

    The image is written to an in-memory buffer through its ``save`` method
    with ``format='JPEG'``; the digest covers exactly those bytes, not any
    file on disk.
    """
    buffer = io.BytesIO()
    try:
        image.save(buffer, format='JPEG')
        digest = hashlib.md5(buffer.getvalue())
    finally:
        # Mirror the release a ``with`` block would perform on the buffer.
        buffer.close()
    return digest.hexdigest()
  
def process_files(directory):
    """Build one TSV row per ``<name>.json`` / ``<name>.jpg`` pair in *directory*.

    Each row holds six tab-separated fields: the MD5 hex digest of the image
    re-encoded as JPEG, the caption, the base64 text of those same JPEG
    bytes, the width, the height, and the ``str()`` of a dict with the keys
    ``'phrase'`` (taken from ``noun_chunks``) and ``'expression_v1'`` (taken
    from ``ref_exps``).

    Args:
        directory: Folder that is listed (non-recursively) for ``.json``
            files; the matching ``.jpg`` is expected next to each of them.

    Returns:
        A list of tab-joined row strings, one per JSON file, in the order
        ``os.listdir`` yields them.

    Raises:
        FileNotFoundError: If a JSON file has no sibling ``.jpg`` file.
        KeyError: If a JSON document lacks one of the keys read here.
    """
    json_suffix = '.json'
    # A raw tab or line break inside the caption would split the row into
    # extra columns or lines, so those characters are flattened to spaces.
    field_breaks = str.maketrans({'\t': ' ', '\r': ' ', '\n': ' '})
    tsv_data = []

    for file in tqdm(os.listdir(directory)):
        if not file.endswith(json_suffix):
            continue

        json_path = os.path.join(directory, file)
        # Swap only the trailing extension; str.replace would also rewrite a
        # '.json' that happens to occur earlier in the file name.
        jpg_path = os.path.join(directory, file[:-len(json_suffix)] + '.jpg')

        # JSON text is UTF-8 by specification; do not depend on the locale.
        with open(json_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)

        # Encode the image once and reuse the bytes for both the digest and
        # the base64 payload (the digest equals what calculate_md5 returns
        # for the same image). The context managers close the image file
        # handle and the buffer.
        with Image.open(jpg_path) as image, io.BytesIO() as buffer:
            image.save(buffer, format='JPEG')
            jpeg_bytes = buffer.getvalue()

        md5 = hashlib.md5(jpeg_bytes).hexdigest()
        image_base64 = base64.b64encode(jpeg_bytes).decode("utf-8")

        caption = str(data['caption']).translate(field_breaks)
        width = data['width']
        height = data['height']

        # Written out via str(), i.e. as a Python dict literal, which keeps
        # any tabs/newlines inside the values escaped.
        combined_data_str = {'phrase': data['noun_chunks'], 'expression_v1': data['ref_exps']}

        tsv_row = [md5, caption, image_base64, width, height, combined_data_str]
        tsv_data.append('\t'.join(map(str, tsv_row)))

    return tsv_data
  
def write_tsv(tsv_data, output_file):
    """Write the prepared rows to *output_file*, one row per line.

    Args:
        tsv_data: Iterable of already tab-joined row strings.
        output_file: Path of the file to create or overwrite.

    Rows are joined with ``'\\n'``; no newline follows the last row. The file
    is always encoded as UTF-8 so that non-ASCII captions are written the
    same way regardless of the platform's default locale encoding.
    """
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write('\n'.join(tsv_data))
  
if __name__ == '__main__':
    # Script entry point: convert the JSON/JPEG pairs found in the source
    # folder into rows and store them in a single TSV file.
    source_dir, target_path = '/tmp/grit', '/tmp/output.tsv'
    rows = process_files(source_dir)
    write_tsv(rows, target_path)