from transformers import pipeline, TrOCRProcessor, VisionEncoderDecoderModel, T5ForConditionalGeneration, T5Tokenizer
from pdf2image import convert_from_path, convert_from_bytes
from IPython.display import clear_output
from PIL import Image
import cv2
import numpy as np
import torch
import gradio as gr

MIN_BOX_WIDTH = 8 # Минимальная ширина текстовой области (в пикселях)
MIN_BOX_HEIGHT = 15 # Минимальная высота текстовой области (в пикселях)
MAX_PART_WIDTH = 600 # Максимальная ширина части строки (в пикселях)
BOX_HEIGHT_TOLERANCE = 8 # Максимальная разница между высотами текстовых областей для добавлению в строку (в пикселях)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-printed")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-printed")
model.to(device)

summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)

model_translation = T5ForConditionalGeneration.from_pretrained('utrobinmv/t5_translate_en_ru_zh_small_1024')
model_translation.to(device)
tokenizer_translation = T5Tokenizer.from_pretrained('utrobinmv/t5_translate_en_ru_zh_small_1024')

def get_text_from_images(images):
  extracted_text = []
  image_number = 0
  for image in images:
    image_number += 1
    image_cv = np.array(image)
    image_cv = cv2.cvtColor(image_cv, cv2.COLOR_RGB2BGR)

    gray = cv2.cvtColor(image_cv, cv2.COLOR_BGR2GRAY)
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    bounding_boxes = [cv2.boundingRect(contour) for contour in contours]

    def group_boxes_into_lines(boxes, tolerance=BOX_HEIGHT_TOLERANCE):
        sorted_boxes = sorted(boxes, key=lambda box: box[1])

        lines = []
        current_line = []

        for box in sorted_boxes:
            x, y, w, h = box

            if not current_line:
                current_line.append(box)
            else:
                last_box = current_line[-1]
                last_y = last_box[1]

                if abs(y - last_y) <= tolerance:
                    current_line.append(box)
                else:
                    lines.append(current_line)
                    current_line = [box]

        if current_line:
            lines.append(current_line)

        return lines

    lines = group_boxes_into_lines(bounding_boxes)

    line_number = 0
    for line in lines:
        line_number += 1

        x_coords = [box[0] for box in line]
        y_coords = [box[1] for box in line]
        widths = [box[2] for box in line]
        heights = [box[3] for box in line]

        x_min = min(x_coords)
        y_min = min(y_coords)
        x_max = max(x_coords[i] + widths[i] for i in range(len(line)))
        y_max = max(y_coords[i] + heights[i] for i in range(len(line)))

        line_image = image_cv[y_min:y_max, x_min:x_max]

        if line_image.size == 0 or line_image.shape[0] < MIN_BOX_HEIGHT or line_image.shape[1] < MIN_BOX_WIDTH:
            continue

        parts = []

        if line_image.shape[1] > MAX_PART_WIDTH:
          num_parts = (line_image.shape[1] // MAX_PART_WIDTH) + 1
          part_width = line_image.shape[1] // num_parts

          for i in range(num_parts):
            start_x = i * part_width
            end_x = (i + 1) * part_width if i < num_parts - 1 else line_image.shape[1]
            part = line_image[:, start_x:end_x]
            parts.append(part)
        else:
          parts.append(line_image)

        line_text = ""
        part_number = 0

        for part in parts:
          part_number += 1
          #clear_output()
          print(f"Images: {image_number}/{len(images)}")
          print(f"Lines: {line_number}/{len(lines)}")
          print(f"Parts: {part_number}/{len(parts)}")

          part_image_pil = Image.fromarray(cv2.cvtColor(part, cv2.COLOR_BGR2RGB))
          #display(part_image_pil)
          print("\n".join(extracted_text))

          pixel_values = processor(part_image_pil, return_tensors="pt").pixel_values
          pixel_values = pixel_values.to(device)
          generated_ids = model.generate(pixel_values)
          text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

          line_text += text

        extracted_text.append(line_text)

  final_text = "\n".join(extracted_text)
  return final_text

def summarize(text, max_length=300, min_length=150):
  result = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
  return result[0]['summary_text']

def translate(text):
  prefix = 'translate to ru: '
  src_text = prefix + text

  input_ids = tokenizer_translation(src_text, return_tensors="pt")

  generated_tokens = model_translation.generate(**input_ids.to(device))

  result = tokenizer_translation.batch_decode(generated_tokens, skip_special_tokens=True)
  return result[0]

def launch(images, language):
  if images == None or not images:
    return "No input provided."
  raw_text = get_text_from_images(images)
  summary = summarize(raw_text)
  if language == "rus":
    return translate(summary)
  return summary

def pdf_to_image(pdf, index = 0):
  images = convert_from_bytes(pdf)
  if 0 <= index < len(images):
    return [images[index]]
  return []

def pdf_to_images(pdf):
  images = convert_from_bytes(pdf)
  return images

def process_pdf(pdf_file, process_mode, page_index, language): 
  if process_mode == "all":
    return launch(pdf_to_images(pdf_file), language)
  elif process_mode == "single":
    return launch(pdf_to_image(pdf_file, page_index), language)

def process_images(images, language):
  pil_images = []
  for image in images:
    pil_images.append(Image.open(image))
  launch(pil_images, language)

class PrintToTextbox:
    def __init__(self, textbox):
        self.textbox = textbox
        self.buffer = ""

    def write(self, text):
        self.buffer += text
        self.textbox.update(self.buffer)

    def flush(self):
        pass

def update_page_index_visibility(process_mode):
    if process_mode == "single":
        return gr.update(visible=True)
    else:
        return gr.update(visible=False)

with gr.Blocks() as demo:
    gr.Markdown("# PDF and Image Text Summarizer")
    gr.Markdown("Upload a PDF file or images to extract and summarize text.")
    gr.Markdown("Takes about 10 minutes per page.")

    language = gr.Radio(choices=["rus", "eng"], label="Output Language", value="rus")
    
    with gr.Tabs():
        with gr.TabItem("PDF"):
            pdf_file = gr.File(label="Upload PDF File", type="binary")
            process_mode = gr.Radio(choices=["single", "all"], label="Process Mode", value="single")
            page_index = gr.Number(label="Page Index", value=0, precision=0)
            pdf_output = gr.Textbox(label="Extracted Text")
            pdf_button = gr.Button("Extract Text from PDF")
        
        with gr.TabItem("Images"):
            images = gr.Files(label="Upload Images", file_types=["image"])
            image_output = gr.Textbox(label="Extracted Text")
            image_button = gr.Button("Extract Text from Images")
    
    pdf_button.click(process_pdf, inputs=[pdf_file, process_mode, page_index, language], outputs=pdf_output)
    image_button.click(process_images, inputs=[images, language], outputs=image_output)
    process_mode.change(update_page_index_visibility, inputs=process_mode, outputs=page_index)

demo.launch(debug=True)