Spaces:

Hieucyber2208
/

know-flow

Running

File size: 9,146 Bytes

e62cec6

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Cài đặt thư viện\n",
    "API Free của Gemini nên không cần private"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import fitz  # PyMuPDF\n",
    "from docx import Document\n",
    "import google.generativeai as genai\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "# Load biến môi trường từ .env\n",
    "load_dotenv()\n",
    "GOOGLE_API_KEY = os.getenv(\"GOOGLE_API_KEY\")\n",
    "genai.configure(api_key=GOOGLE_API_KEY)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Thiết lập hàm đọc file\n",
    "Chấp nhận hai định dạng là .doc và .pdf. Đảm bảo file bài giảng nhiều chữ."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_text_from_pdf(pdf_path):\n",
    "    # Mở file PDF\n",
    "    doc = fitz.open(pdf_path)\n",
    "    text = \"\"\n",
    "    for page_num in range(doc.page_count):\n",
    "        page = doc.load_page(page_num)\n",
    "        text += page.get_text()\n",
    "    return text\n",
    "\n",
    "def extract_text_from_docx(docx_path):\n",
    "    # Mở file DOCX\n",
    "    doc = Document(docx_path)\n",
    "    text = \"\"\n",
    "    for para in doc.paragraphs:\n",
    "        text += para.text + \"\\n\"\n",
    "    return text\n",
    "\n",
    "def extract_text_from_file(file_path):\n",
    "    # Kiểm tra loại file và gọi hàm tương ứng\n",
    "    file_extension = os.path.splitext(file_path)[1].lower()\n",
    "\n",
    "    if file_extension == '.pdf':\n",
    "        return extract_text_from_pdf(file_path)\n",
    "    elif file_extension == '.docx':\n",
    "        return extract_text_from_docx(file_path)\n",
    "    else:\n",
    "        raise ValueError(\"Unsupported file format. Only PDF and DOCX are supported.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = extract_text_from_file(\"../data/input/sample.pdf\")\n",
    "with open(\"../data/text/text.txt\", \"w\", encoding=\"utf-8\") as f:\n",
    "    f.write(text)  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Gọi API tiến hành chia đoạn và phân tích tóm tắt\n",
    "Đưa ra phân tích và lưu lại file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def split_text_by_semantics(text, number_of_chunks):\n",
    "    prompt = f\"\"\"\n",
    "    Bạn là một chuyên gia xử lý văn bản. Hãy chia văn bản sau thành {number_of_chunks} đoạn có ý nghĩa sao cho mỗi đoạn vừa đủ để giải thích trong khoảng 3 đến 5 câu.\n",
    "\n",
    "    Văn bản:\n",
    "    {text}\n",
    "\n",
    "    Định dạng đầu ra:\n",
    "    - Phần 1: [Nội dung]\n",
    "    - Phần 2: [Nội dung]\n",
    "    - Phần 3: [Nội dung]\n",
    "    \"\"\"\n",
    "\n",
    "    try:\n",
    "        model = genai.GenerativeModel(\"gemini-pro\")\n",
    "        response = model.generate_content(prompt)\n",
    "        result_text = response.text.strip()\n",
    "\n",
    "        chunks = result_text.split(\"- Phần \")\n",
    "        chunks = [chunk.strip() for chunk in chunks if chunk]\n",
    "        return chunks\n",
    "    except Exception as e:\n",
    "        print(f\"Lỗi khi gọi API Gemini: {e}\")\n",
    "        return []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_explaination_for_chunks(chunks, analysis_level='basic', style='academic', word_limit=100):\n",
    "    \"\"\"\n",
    "    Phân tích nội dung của văn bản theo mức độ và phong cách mong muốn.\n",
    "    \n",
    "    :param chunks: Danh sách các đoạn văn bản cần phân tích.\n",
    "    :param text: Toàn bộ văn bản gốc.\n",
    "    :param analysis_level: Mức độ phân tích ('basic' hoặc 'detailed').\n",
    "    :param style: Phong cách phân tích ('academic', 'popular', 'creative', 'humorous').\n",
    "    :param word_limit: Số từ ước lượng cho mỗi phần tóm tắt.\n",
    "    :return: Danh sách các phân tích tương ứng với từng đoạn.\n",
    "    \"\"\"\n",
    "    \n",
    "    level_prompts = {\n",
    "        'basic': \"Hãy đưa ra một bản tóm tắt ngắn gọn, tập trung vào nội dung chính.\",\n",
    "        'detailed': \"Hãy phân tích chuyên sâu từng phần, làm rõ ý nghĩa, ngữ cảnh và các yếu tố quan trọng.\"\n",
    "    }\n",
    "    \n",
    "    style_prompts = {\n",
    "        'academic': \"Phân tích theo phong cách học thuật, sử dụng ngôn ngữ chuyên sâu và lập luận chặt chẽ.\",\n",
    "        'popular': \"Trình bày theo phong cách phổ thông, dễ hiểu và phù hợp với nhiều đối tượng.\",\n",
    "        'creative': \"Giải thích một cách sáng tạo, sử dụng hình ảnh ẩn dụ và cách diễn đạt thú vị.\",\n",
    "        'humorous': \"Phân tích theo phong cách hài hước, thêm vào yếu tố vui nhộn và bất ngờ.\"\n",
    "    }\n",
    "    \n",
    "    overview_prompt = f\"\"\"\n",
    "    Đây là một văn bản có nội dung quan trọng. Bạn sẽ phân tích từng phần theo mức độ '{analysis_level}' và phong cách '{style}'.\n",
    "    Văn bản gồm các phần sau: {', '.join([f'Phần {i+1}' for i in range(len(chunks))])}.\n",
    "    {level_prompts[analysis_level]}\n",
    "    {style_prompts[style]}\n",
    "    Mỗi phần không vượt quá {word_limit} từ.\n",
    "    \"\"\"\n",
    "    \n",
    "    try:\n",
    "        model = genai.GenerativeModel(\"gemini-pro\")\n",
    "        response = model.generate_content(overview_prompt)\n",
    "        overview_text = response.text.strip()\n",
    "        \n",
    "        explanations = []\n",
    "        for idx, chunk in enumerate(chunks, start=1):\n",
    "            part_prompt = f\"\"\"\n",
    "            Phân tích phần {idx} của văn bản.\n",
    "            {level_prompts[analysis_level]}\n",
    "            {style_prompts[style]}\n",
    "            Nội dung phần này:\n",
    "            {chunk}\n",
    "            Hãy đảm bảo phần tóm tắt không vượt quá {word_limit} từ.\n",
    "            \"\"\"\n",
    "            \n",
    "            part_response = model.generate_content(part_prompt)\n",
    "            explanations.append(part_response.text.strip())\n",
    "        \n",
    "        return explanations\n",
    "    \n",
    "    except Exception as e:\n",
    "        print(f\"Lỗi khi gọi API Gemini: {e}\")\n",
    "        return []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tách văn bản theo ngữ nghĩa sử dụng API Gemini\n",
    "semantic_chunks = split_text_by_semantics(text, number_of_chunks=3)\n",
    "\n",
    "# Tạo thuyết minh cho từng phần semantic chunk\n",
    "explainations = generate_explaination_for_chunks(semantic_chunks)\n",
    "\n",
    "# In kết quả\n",
    "for idx, explaination in enumerate(explainations, start=1):\n",
    "    print(f\"Giải thích cho Phần {idx}:\\n{explaination}\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lưu từng câu vào tệp riêng biệt\n",
    "for idx, explaination in enumerate(explainations, start=1):\n",
    "    # Tách đoạn văn bản thành các câu dựa trên dấu chấm\n",
    "    sentences = explaination.split('.')\n",
    "    \n",
    "    # Lưu từng câu vào tệp riêng biệt\n",
    "    for sentence_idx, sentence in enumerate(sentences, start=1):\n",
    "        sentence = sentence.strip()  # Loại bỏ khoảng trắng thừa\n",
    "        if sentence:  # Kiểm tra nếu câu không rỗng\n",
    "            output_file = f\"../data/text/{idx}_{sentence_idx}.txt\"  # Tên tệp theo định dạng x_y.txt\n",
    "            with open(output_file, \"w\", encoding=\"utf-8\") as f:\n",
    "                f.write(f\"'{sentence}'\")  # Ghi câu trong dấu nháy đơn\n",
    "            print(f\"Đã lưu: {output_file}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}