"""Flask service that returns OCR text or a generated caption for an image.

A client POSTs a Base64-encoded image to ``/api``. If Tesseract finds text in
the image, the cleaned-up text is returned; otherwise an Xception encoder and
a caption model produce a description of the picture.
"""

import base64
import functools
import io
from pickle import load

import numpy as np
import pytesseract
import tensorflow as tf
from flask import Flask, request, jsonify
from keras.applications.xception import Xception  # to get pre-trained model Xception
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from PIL import Image

app = Flask(__name__)

# Upper bound on generated words; also the padding length fed to the model.
MAX_LENGTH = 38

# Artifacts read from the current working directory when a caption is needed.
TOKENIZER_PATH = "tokenizer.p"
CAPTION_MODEL_PATH = "model_9.keras"

# Languages handed to Tesseract for OCR.
OCR_LANGUAGES = "eng+chi_sim+msa"

# Set up GPU memory growth
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        # Allow memory growth for all GPUs
        for gpu in physical_devices:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPU(s) memory growth set to True")
    except RuntimeError as e:
        # Best effort: report the problem and keep TensorFlow's defaults.
        print(e)


def format_tesseract_output(output_text):
    """Return *output_text* with blank lines and per-line padding removed.

    Every non-empty line is stripped and terminated with a newline, so the
    result is either an empty string or ends with ``"\\n"``.
    """
    stripped_lines = (line.strip() for line in output_text.strip().split("\n"))
    return "".join(f"{line}\n" for line in stripped_lines if line)


def extract_features(image_data, model):
    """Encode raw image bytes into a feature vector using *model*.

    Args:
        image_data: Encoded image bytes (any format Pillow can open).
        model: Object with a ``predict`` method taking a ``(1, 299, 299, 3)``
            float array scaled to ``[-1, 1]``.

    Returns:
        Whatever ``model.predict`` returns, or ``None`` when the bytes cannot
        be decoded as an image.
    """
    try:
        image = Image.open(io.BytesIO(image_data))
        # Normalise to three channels up front. Indexing ``shape[2]`` on the
        # raw array fails for grayscale/palette images (2-D arrays) and leaves
        # 2-channel images untouched; converting handles every mode and still
        # drops the alpha channel of RGBA input.
        image = image.convert("RGB")
    except Exception as e:
        print("ERROR: Can't open image! Ensure that image data is correct and in the expected format")
        print(str(e))
        return None

    image = image.resize((299, 299))
    pixels = np.expand_dims(np.array(image), axis=0)
    # Map 0..255 pixel values onto -1..1.
    pixels = pixels / 127.5
    pixels = pixels - 1.0
    return model.predict(pixels)


def word_for_id(integer, tokenizer):
    """Return the vocabulary word whose index is *integer*, or ``None``.

    Uses the tokenizer's ``index_word`` reverse mapping when it is available,
    so each lookup does not have to scan the whole vocabulary; otherwise falls
    back to searching ``word_index``.
    """
    index_word = getattr(tokenizer, "index_word", None)
    if index_word:
        return index_word.get(int(integer))

    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None


def generate_desc(model, tokenizer, photo, max_length):
    """Greedily decode a caption for *photo*.

    Starting from the ``start`` token, the most probable next word is appended
    until ``end`` is predicted, an unknown index is produced, or *max_length*
    words have been generated.

    Returns:
        The generated words joined by single spaces, without the ``start`` and
        ``end`` markers. An empty string when no word was generated.
    """
    words = []
    for _ in range(max_length):
        # The model is always conditioned on the marker plus the words so far.
        in_text = ' '.join(['start'] + words)
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        pred = model.predict([photo, sequence], verbose=0)
        word = word_for_id(np.argmax(pred), tokenizer)
        if word is None or word == 'end':
            break
        words.append(word)
    # Joining the collected words (instead of string-replacing 'start ') avoids
    # returning the bare marker for an empty caption and never removes a
    # legitimate "start" that appears inside the caption.
    return ' '.join(words)


@functools.lru_cache(maxsize=1)
def _load_caption_models():
    """Load the tokenizer, caption model and Xception encoder, then cache them.

    Loading is deferred until the first image without text and the result is
    reused afterwards instead of being rebuilt for every request.
    """
    # NOTE(security): unpickling executes arbitrary code. TOKENIZER_PATH must
    # only ever point at a trusted, locally produced file.
    with open(TOKENIZER_PATH, "rb") as tokenizer_file:
        tokenizer = load(tokenizer_file)
    caption_model = load_model(CAPTION_MODEL_PATH)
    xception_model = Xception(include_top=False, pooling="avg")
    return tokenizer, caption_model, xception_model


# API endpoint to receive image and generate caption
@app.route('/api', methods=['POST'])
def generate_caption():
    """Handle ``POST /api``: return OCR text or a generated caption.

    Expects a form field ``image`` holding the Base64-encoded image.

    Returns:
        ``200`` with ``{'hasText': bool, 'result': str}`` on success,
        ``400`` with ``{'error': ...}`` when the field is missing, is not valid
        Base64, or features cannot be extracted from the image,
        ``500`` with ``{'error': ...}`` for any other failure.
    """
    try:
        base64_image_data = request.form.get('image')
        if not base64_image_data:
            return jsonify({'error': "Missing 'image' form field"}), 400

        # Decode the Base64 string into binary image data
        try:
            image_data = base64.b64decode(base64_image_data)
        except ValueError as e:  # binascii.Error is a ValueError subclass
            return jsonify({'error': f'Invalid Base64 image data: {e}'}), 400

        # Convert the image data to a PIL image object
        pil_image = Image.open(io.BytesIO(image_data))
        extracted_text = pytesseract.image_to_string(pil_image, lang=OCR_LANGUAGES)
        has_text = bool(extracted_text.strip())

        if has_text:
            result = format_tesseract_output(extracted_text)
        else:
            tokenizer, model, xception_model = _load_caption_models()
            photo = extract_features(image_data, xception_model)
            if photo is None:
                return jsonify({'error': 'Failed to extract features from the image'}), 400
            result = generate_desc(model, tokenizer, photo, MAX_LENGTH)

        return jsonify({'hasText': has_text, 'result': result}), 200
    except Exception as e:
        # Top-level boundary: record the traceback, then report the failure.
        app.logger.exception("Caption request failed")
        return jsonify({'error': str(e)}), 500


if __name__ == '__main__':
    app.run()