Spaces:
				
			
			
	
			
			
					
		Running
		
			on 
			
			CPU Upgrade
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
			on 
			
			CPU Upgrade
	File size: 5,352 Bytes
			
import os
import json
import boto3
import io
from io import BytesIO
import sys
from pprint import pprint
from PyPDF2 import PdfWriter, PdfReader
import re
import shutil
import streamlit as st
# Module-level accumulator: CSV output path -> extracted table text.
# split_pages() fills it and hands the same dict back to its caller.
file_content = {}
# Path of the directory one level above the folder that holds this file
# (computed on "/"-separated paths: everything before the last "/").
parent_dirname = os.path.dirname(__file__).rpartition("/")[0]
# if os.path.isdir(parent_dirname+"/split_pdf"):
#     shutil.rmtree(parent_dirname+"/split_pdf")
# os.mkdir(parent_dirname+"/split_pdf")
# if os.path.isdir(parent_dirname+"/split_pdf_csv"):
#     shutil.rmtree(parent_dirname+"/split_pdf_csv")
# os.mkdir(parent_dirname+"/split_pdf_csv")
def get_rows_columns_map(table_result, blocks_map):
    """Group the text of a table's cells by row index and column index.

    Args:
        table_result: A ``TABLE`` block (dict). Its ``CHILD`` relationships
            list the ids of the blocks that belong to the table.
        blocks_map: Mapping of block id -> block for the analysed document.

    Returns:
        ``{row_index: {col_index: text}}`` with one entry per ``CELL`` child,
        where ``text`` is what ``get_text`` returns for that cell. The dict is
        empty when the table has no ``CELL`` children.

    Raises:
        KeyError: If a referenced child id is missing from ``blocks_map``.
    """
    rows = {}
    # 'Relationships' may be absent on a block (get_text checks for it too),
    # so a table without children yields an empty mapping, not a KeyError.
    for relationship in table_result.get('Relationships') or []:
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            cell = blocks_map[child_id]
            # Only CELL children carry RowIndex / ColumnIndex; skip the rest.
            if cell['BlockType'] != 'CELL':
                continue
            row = rows.setdefault(cell['RowIndex'], {})
            row[cell['ColumnIndex']] = get_text(cell, blocks_map)
    return rows
def get_text(result, blocks_map):
    """Build the text of a block from its child words and selection marks.

    Args:
        result: A block (dict) whose optional ``Relationships`` entry lists
            child block ids under relationships of type ``CHILD``.
        blocks_map: Mapping of block id -> block.

    Returns:
        The pieces concatenated in child order, each followed by one space:
        a ``WORD`` contributes its ``Text`` (wrapped in double quotes when it
        is a comma-grouped number such as ``1,234``), and a
        ``SELECTION_ELEMENT`` whose status is ``SELECTED`` contributes ``X``.
        Returns ``''`` when the block has no ``Relationships`` entry.
    """
    if 'Relationships' not in result:
        return ''

    pieces = []
    for relation in result['Relationships']:
        if relation['Type'] != 'CHILD':
            continue
        for block_id in relation['Ids']:
            child = blocks_map[block_id]
            kind = child['BlockType']
            if kind == 'WORD':
                token = child['Text']
                # Digits separated only by commas count as one number; quote
                # it so the embedded commas are not read as separators.
                grouped_number = "," in token and token.replace(",", "").isnumeric()
                if grouped_number:
                    pieces.append('"' + token + '"' + ' ')
                else:
                    pieces.append(token + ' ')
            elif kind == 'SELECTION_ELEMENT' and child['SelectionStatus'] == 'SELECTED':
                pieces.append('X ')
    return ''.join(pieces)
def split_pages(file_name):
    """Split a PDF into single-page PDFs and extract each page's tables.

    Each page of ``file_name`` is written to
    ``<parent_dirname>/split_pdf/<name><page>.pdf`` and passed to
    ``get_table_csv_results``. When that page has tables, their text is
    written to ``<parent_dirname>/split_pdf_csv/<name><page>.csv`` and stored
    in the module-level ``file_content`` dict under that path. ``<name>`` is
    the lower-cased base name of the input reduced to ASCII letters and
    digits; ``<page>`` is the zero-based page number.

    Args:
        file_name: "/"-separated path of the PDF to process.

    Returns:
        The module-level ``file_content`` dict (CSV path -> extracted text).
        The dict is shared, so entries added by earlier calls remain in it.
    """
    pdf_dir = parent_dirname + "/split_pdf"
    csv_dir = parent_dirname + "/split_pdf_csv"
    # Nothing else in this module creates the output folders, so make sure
    # they exist before any page is written.
    os.makedirs(pdf_dir, exist_ok=True)
    os.makedirs(csv_dir, exist_ok=True)

    file_name_short = re.sub('[^A-Za-z0-9]+', '', (file_name.split("/")[-1].split(".")[0]).lower())

    # The reader pulls page data from the open stream, so the whole loop runs
    # inside the ``with`` block and the handle is closed once it finishes.
    with open(file_name, "rb") as source:
        inputpdf = PdfReader(source)
        for i, page in enumerate(inputpdf.pages):
            output = PdfWriter()
            output.add_page(page)
            split_file = pdf_dir + "/" + file_name_short + "%s.pdf" % i

            with open(split_file, "wb") as outputStream:
                output.write(outputStream)

            table_csv = get_table_csv_results(split_file)
            # Sentinel returned by get_table_csv_results for a page without tables.
            if table_csv == "<b> NO Table FOUND </b>":
                continue

            output_file = csv_dir + "/" + file_name_short + "%s.csv" % i
            file_content[output_file] = table_csv
            # replace content
            with open(output_file, "wt") as fout:
                fout.write(table_csv)
            # show the results
            print('CSV OUTPUT FILE: ', output_file)
    return file_content
def get_table_csv_results(file_name):
    """Analyse a document with Textract and return its tables as text.

    The file's bytes are sent to the Textract ``analyze_document`` call with
    the ``TABLES`` feature, using credentials read from ``st.secrets`` and the
    ``us-east-1`` region.

    Args:
        file_name: Path of the document to analyse.

    Returns:
        The output of ``generate_table_csv`` for every ``TABLE`` block, each
        followed by two newlines, concatenated in response order; or the
        string ``"<b> NO Table FOUND </b>"`` when the response has no
        ``TABLE`` block.
    """
    with open(file_name, 'rb') as document:
        payload = bytearray(document.read())

    textract = boto3.client(
        'textract',
        aws_access_key_id=st.secrets['user_access_key'],
        aws_secret_access_key=st.secrets['user_secret_key'],
        region_name='us-east-1',
    )
    response = textract.analyze_document(Document={'Bytes': payload}, FeatureTypes=['TABLES'])

    # Index every block by id (cells and words are looked up through it) and
    # pick out the table blocks to render.
    blocks = response['Blocks']
    blocks_map = {block['Id']: block for block in blocks}
    table_blocks = [block for block in blocks if block['BlockType'] == "TABLE"]

    if not table_blocks:
        return "<b> NO Table FOUND </b>"

    # Tables are numbered from 1 in the order they appear in the response.
    return ''.join(
        generate_table_csv(table, blocks_map, number) + '\n\n'
        for number, table in enumerate(table_blocks, start=1)
    )
def generate_table_csv(table_result, blocks_map, table_index):
    """Render one table as backtick-delimited text.

    Each cell's text is stripped and followed by a backtick, each row ends
    with a newline, and the table is terminated by three further newlines.

    Args:
        table_result: The ``TABLE`` block to render.
        blocks_map: Mapping of block id -> block, passed on to
            ``get_rows_columns_map``.
        table_index: Number of the table on the page. Kept so existing callers
            keep working; it does not appear in the output.

    Returns:
        The rendered table as a single string.
    """
    rows = get_rows_columns_map(table_result, blocks_map)

    lines = []
    # Emit rows and columns in index order explicitly, so the layout does not
    # depend on the order in which the cells were listed under the table.
    for _, cols in sorted(rows.items()):
        cells = [text.strip() + "`" for _, text in sorted(cols.items())]
        lines.append(''.join(cells) + '\n')
    return ''.join(lines) + '\n\n\n'
def main_(file_name):
    """Process ``file_name`` with ``split_pages`` and return its result.

    Args:
        file_name: Path of the PDF to process.

    Returns:
        Whatever ``split_pages`` returns for ``file_name``.
    """
    return split_pages(file_name)
    
# if __name__ == "__main__":
#     file_name = "/home/ubuntu/covid19_ie_removed.pdf"
#     main(file_name)
 | 
