import os
import tempfile

import cv2
import numpy as np
import pytesseract
from fastapi import FastAPI, File, UploadFile
from pdf2image import convert_from_path
from transformers import AutoTokenizer, AutoModelForCausalLM

app = FastAPI()

# Hugging Face LLM configuration — loaded once at process startup so each
# request does not pay the model-load cost.
MODEL_NAME = "deepseek-ai/deepseek-coder-1.3b"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)


def extract_text_from_pdf(pdf_path: str) -> str:
    """OCR every page of a PDF (Japanese earnings report) into one text string.

    Each page is rasterized via pdf2image, binarized with Otsu thresholding
    (improves Tesseract accuracy on scanned documents), then OCR'd with the
    Japanese language pack. Returns pages joined with newlines, one trailing
    newline per page; "" for a zero-page PDF.
    """
    images = convert_from_path(pdf_path)
    pages = []
    for image in images:
        # PIL gives RGB; OpenCV expects BGR.
        img_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
        thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
        pages.append(pytesseract.image_to_string(thresh, lang="jpn"))
    # join once instead of quadratic `+=` concatenation in the loop
    return "".join(page + "\n" for page in pages)


def summarize_text(text: str) -> str:
    """Summarize OCR'd earnings-report text for investors using the LLM.

    The prompt is truncated to the model's 4096-token context window before
    generation.
    """
    prompt = f"以下の決算短信を投資家向けに要約してください:\n{text}"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096)
    # Use max_new_tokens, NOT max_length: generate()'s max_length counts
    # prompt + output tokens, so a prompt near 4096 tokens would leave no
    # budget for the summary (and can raise for inputs longer than 512).
    output_ids = model.generate(
        inputs.input_ids,
        max_new_tokens=512,
        num_beams=5,
        early_stopping=True,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


@app.post("/upload/")
async def upload_pdf(file: UploadFile = File(...)):
    """Accept a PDF upload, OCR it, and return an LLM-generated summary.

    Returns: JSON object ``{"summary": <str>}``.
    """
    # Never build a path from the client-supplied filename directly:
    # f"/tmp/{file.filename}" is vulnerable to path traversal (e.g.
    # "../../etc/cron.d/x") and to collisions between concurrent uploads.
    # basename() strips directory components; NamedTemporaryFile picks a
    # unique, safe path. Keep only the extension from the upload.
    suffix = os.path.splitext(os.path.basename(file.filename or ""))[1] or ".pdf"
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        tmp.write(await file.read())
        file_path = tmp.name
    try:
        # OCR text extraction, then LLM summarization.
        extracted_text = extract_text_from_pdf(file_path)
        summary = summarize_text(extracted_text)
    finally:
        os.remove(file_path)  # don't leak temp files across requests
    return {"summary": summary}