import io
import os
from flask import Flask, request, jsonify
import base64
import pytesseract
import numpy as np
from pickle import load
from PIL import Image
from keras.applications.xception import Xception  # pre-trained Xception model for image features
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
# Print environment details for debugging (Debian release, OS banner, available Tesseract packages)
print(os.popen('cat /etc/debian_version').read())
print(os.popen('cat /etc/issue').read())
print(os.popen('apt search tesseract').read())
app = Flask(__name__)
MAX_LENGTH = 38
# Set the TESSDATA_PREFIX environment variable
# os.environ['TESSDATA_PREFIX'] = '/cache/huggingface/downloads/tesseract-ocr/4.00/tessdata'
# Set the path to the Tesseract executable
# pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
def format_tesseract_output(output_text):
    formatted_text = ""
    lines = output_text.strip().split("\n")
    for line in lines:
        line = line.strip()
        if line:
            formatted_text += line + "\n"
    return formatted_text
def extract_features(image_data, model):
    # Load the raw image bytes and compute Xception features for the captioning model
    try:
        image = Image.open(io.BytesIO(image_data))
    except Exception:
        return None
    image = image.resize((299, 299))
    image = np.array(image)
    # drop the alpha channel if the image has 4 channels (RGBA)
    if image.shape[2] == 4:
        image = image[..., :3]
    image = np.expand_dims(image, axis=0)
    # scale pixel values to [-1, 1], as expected by Xception
    image = image / 127.5
    image = image - 1.0
    feature = model.predict(image)
    return feature
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None
def generate_desc(model, tokenizer, photo, max_length):
    # Greedily generate a caption word by word, starting from the 'start' token
    in_text = 'start'
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        pred = model.predict([photo, sequence], verbose=0)
        pred = np.argmax(pred)
        word = word_for_id(pred, tokenizer)
        if word is None or word == 'end':
            break
        in_text += ' ' + word
    return in_text.replace('start ', '')
# API endpoint to receive an image and return OCR text or a generated caption
@app.route('/api', methods=['POST'])
def generate_caption():
    try:
        base64_image_data = request.form['image']
        # Decode the Base64 string into binary image data
        image_data = base64.b64decode(base64_image_data)
        # Convert the image data to a PIL image object and run OCR on it
        pil_image = Image.open(io.BytesIO(image_data))
        extracted_text = pytesseract.image_to_string(pil_image)
        hasText = bool(extracted_text.strip())
        if hasText:
            result = format_tesseract_output(extracted_text)
        else:
            # No readable text: fall back to the image-captioning model
            tokenizer = load(open("tokenizer.p", "rb"))
            model = load_model('model_9.keras')
            xception_model = Xception(include_top=False, pooling="avg")
            photo = extract_features(image_data, xception_model)
            if photo is None:
                return jsonify({'error': 'Failed to extract features from the image'}), 400
            result = generate_desc(model, tokenizer, photo, MAX_LENGTH)
        return jsonify({'hasText': hasText, 'result': result}), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500


if __name__ == '__main__':
    app.run()
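
# Example client call (a minimal sketch, not part of the app): POST a Base64-encoded
# image to the /api endpoint as the 'image' form field and print the JSON response.
# Assumes the server is running locally on Flask's default port and that a file
# named "example.jpg" exists; both are placeholders.
#
#   import base64
#   import requests
#
#   with open("example.jpg", "rb") as f:
#       encoded = base64.b64encode(f.read()).decode("utf-8")
#   response = requests.post("http://127.0.0.1:5000/api", data={"image": encoded})
#   print(response.json())  # e.g. {"hasText": false, "result": "<generated caption>"}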