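"""Multimodal PDF indexing pipeline.

Extracts text, tables (Camelot) and images (PyMuPDF) from PDF documents,
generates French descriptions of tables and figures with a local Gemma 3
multimodal model, embeds each text chunk and description with a multilingual
Sentence-Transformers model, and writes the resulting records to a JSON file
for downstream retrieval.
"""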
import os
import json
import traceback
import re
import time
import random
from pathlib import Path
import tiktoken
import numpy as np
from PIL import Image # Pillow for image handling
import io # To handle image bytes

# Gemma imports
import jax.numpy as jnp
# Let JAX preallocate the full GPU memory for the Gemma model;
# lower this fraction if the GPU is shared with other processes.
os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"] = "1.00"
from gemma import gm

# Sentence-Transformers for text embedding
from sentence_transformers import SentenceTransformer


# --- Configuration ---
# Set the desired Gemma model
GEMMA_MULTIMODAL_MODEL = "gemma-3-4b-it" # Other Gemma variants can be substituted if available and suitable

# Set the desired Sentence-Transformers model for text embeddings
# This is a good free, multilingual model.
SENTENCE_TRANSFORMER_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# The dimension of embeddings for this model
EMBEDDING_DIMENSION = 384 # MiniLM-L12-v2 produces 384-dimensional embeddings


MAX_TOKENS_NORMAL = 500
ENCODING_NAME = "cl100k_base" # Tokenizer used to size text chunks
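# Note: cl100k_base is an OpenAI tiktoken encoding; its counts will not match
# Gemma's tokenizer exactly, but they are close enough for chunk sizing.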

# Path configuration
BASE_DIR = Path("/content/") # Default for Colab environment
PDF_DIRECTORY = BASE_DIR / "docs"
OUTPUT_DIR = BASE_DIR / "output"
EMBEDDINGS_FILE_PATH = OUTPUT_DIR / "embeddings_statistiques_multimodal_gemma_st.json"

# Directory to save extracted images and tables HTML (within output)
IMAGE_SAVE_SUBDIR = "extracted_graphs"
TABLE_SAVE_SUBDIR = "extracted_tables"
IMAGE_SAVE_DIR = OUTPUT_DIR / IMAGE_SAVE_SUBDIR
TABLE_SAVE_DIR = OUTPUT_DIR / TABLE_SAVE_SUBDIR
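# Saved artifacts are referenced by /static/... URLs, on the assumption that a
# web application later serves the OUTPUT_DIR subdirectories under that route.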


# Global models
gemma_sampler = None
text_embedding_model = None

def initialize_models():
    """Initializes Gemma and Sentence-Transformers models."""
    global gemma_sampler, text_embedding_model
    
    print("✓ Initializing Gemma Multimodal Model...")
    try:
        model = gm.nn.Gemma3_4B() # Initialize Gemma model
        # Load Gemma parameters
        params = gm.ckpts.load_params(gm.ckpts.CheckpointPath.GEMMA3_4B_IT)
        gemma_sampler = gm.text.ChatSampler(model=model, params=params)
        print(f"✓ Gemma Multimodal Model '{GEMMA_MULTIMODAL_MODEL}' loaded successfully.")
    except Exception as e:
        print(f"❌ ERREUR: Échec du chargement du modèle multimodal Gemma : {str(e)}")
        print("⚠️ La génération de descriptions multimodales échouera.")
        gemma_sampler = None

    print(f"✓ Initializing Sentence-Transformers Model '{SENTENCE_TRANSFORMER_MODEL}'...")
    try:
        text_embedding_model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
        print(f"✓ Modèle d'embedding textuel Sentence-Transformers '{SENTENCE_TRANSFORMER_MODEL}' chargé avec succès.")
    except Exception as e:
        print(f"❌ ERREUR: Échec du chargement du modèle d'embedding textuel Sentence-Transformers : {str(e)}")
        print("⚠️ La génération d'embeddings textuels échouera.")
        text_embedding_model = None


def clean_text(text):
    """Normalize whitespace and clean text while preserving paragraph breaks"""
    if not text:
        return ""
    text = text.replace('\t', ' ')
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()

# --- PDF Processing Functions (PyMuPDF and Camelot) ---
import fitz # PyMuPDF
import camelot # For table extraction
import pandas as pd
from bs4 import BeautifulSoup

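# Minimum pixel dimensions for an extracted image to be kept; anything smaller
# is assumed to be an icon or decorative element.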
IMAGE_MIN_WIDTH = 100
IMAGE_MIN_HEIGHT = 100

def extract_page_data_pymupdf(pdf_path):
    """Extract text, tables and save images from each page using PyMuPDF and Camelot."""
    page_data_list = []
    try:
        doc = fitz.open(pdf_path)
        metadata = doc.metadata or {}
        pdf_data = {
            'pdf_title': metadata.get('title', pdf_path.name),
            'pdf_subject': metadata.get('subject', 'Statistiques'),
            'pdf_keywords': metadata.get('keywords', '')
        }

        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            page_index = page_num + 1  # 1-based index

            print(f"  Extraction des données de la page {page_index}...")

            # Extract tables first
            table_data = extract_tables_and_images_from_page(pdf_path, page, page_index)

            # Track table regions to avoid double-processing text
            table_regions = []
            for item in table_data:
                if 'rect' in item and item['rect'] and len(item['rect']) == 4:
                    table_regions.append(fitz.Rect(item['rect']))
                else:
                    print(f"  Warning: Invalid rect for table on page {page_index}")

            # Extract text excluding table regions
            page_text = ""
            if table_regions:
                blocks = page.get_text("blocks")
                for block in blocks:
                    block_rect = fitz.Rect(block[:4])
                    is_in_table = False
                    for table_rect in table_regions:
                        if block_rect.intersects(table_rect):
                            is_in_table = True
                            break
                    if not is_in_table:
                        page_text += block[4] + "\n"
            else:
                page_text = page.get_text("text")

            page_text = clean_text(page_text)

            # Extract and save images (excluding those identified as tables)
            image_data = extract_images_from_page(pdf_path, page, page_index, excluded_rects=table_regions)

            page_data_list.append({
                'pdf_file': pdf_path.name,
                'page_number': page_index,
                'text': page_text,
                'images': image_data,
                'tables': [item for item in table_data if item['content_type'] == 'table'],
                'pdf_title': pdf_data.get('pdf_title'),
                'pdf_subject': pdf_data.get('pdf_subject'),
                'pdf_keywords': pdf_data.get('pdf_keywords')
            })
        doc.close()
    except Exception as e:
        print(f"Erreur lors du traitement du PDF {pdf_path.name} avec PyMuPDF : {str(e)}")
        traceback.print_exc()
    return page_data_list


def extract_tables_and_images_from_page(pdf_path, page, page_num):
    """Extract tables using Camelot and capture images of table areas."""
    table_and_image_data = []
    try:
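        # Camelot's 'lattice' flavor detects tables drawn with ruling lines;
        # 'stream' (the fallback below) infers tables from whitespace alignment.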
        tables = camelot.read_pdf(
            str(pdf_path),
            pages=str(page_num),
            flavor='lattice',
        )

        if len(tables) == 0:
            tables = camelot.read_pdf(
                str(pdf_path),
                pages=str(page_num),
                flavor='stream'
            )

        for i, table in enumerate(tables):
            if table.accuracy < 70:
                print(f"  Skipping low accuracy table ({table.accuracy:.2f}%) on page {page_num}")
                continue

            # Camelot's parsing_report carries no bbox; the table's bounding box
            # is exposed as table._bbox in PDF coordinates (origin bottom-left),
            # so flip the y-axis to match PyMuPDF's top-left origin.
            table_bbox = getattr(table, '_bbox', None)
            if not table_bbox or len(table_bbox) != 4:
                print(f"  Warning: Invalid bounding box for table {i} on page {page_num}. Skipping image capture.")
                table_rect = None
            else:
                x0, y0, x1, y1 = table_bbox
                page_height = page.rect.height
                table_rect = fitz.Rect(x0, page_height - y1, x1, page_height - y0)

            safe_pdf_name = "".join(c if c.isalnum() else "_" for c in pdf_path.stem)
            table_html_filename = f"{safe_pdf_name}_p{page_num}_table{i}.html"
            table_html_save_path = TABLE_SAVE_DIR / table_html_filename
            relative_html_url_path = f"/static/{TABLE_SAVE_SUBDIR}/{table_html_filename}"

            table_image_filename = f"{safe_pdf_name}_p{page_num}_table{i}.png"
            table_image_save_path = IMAGE_SAVE_DIR / table_image_filename
            relative_image_url_path = f"/static/{IMAGE_SAVE_SUBDIR}/{table_image_filename}"


            df = table.df
            soup = BeautifulSoup(df.to_html(index=False), 'html.parser')
            table_tag = soup.find('table')
            if table_tag:
                # A <caption> is only valid as the first child of <table>.
                caption_tag = soup.new_tag('caption')
                caption_tag.string = f"Table extraite de {pdf_path.name}, page {page_num}"
                table_tag.insert(0, caption_tag)
                table_tag['class'] = 'table table-bordered table-striped'
                table_tag['style'] = 'width:100%; border-collapse:collapse;'

                style_tag = soup.new_tag('style')
                style_tag.string = """
                .table { border-collapse: collapse; width: 100%; margin-bottom: 1rem;}
                .table caption { caption-side: top; padding: 0.5rem; text-align: left; font-weight: bold; }
                .table th, .table td { border: 1px solid #ddd; padding: 8px; text-align: left; }
                .table th { background-color: #f2f2f2; font-weight: bold; }
                .table-striped tbody tr:nth-of-type(odd) { background-color: rgba(0,0,0,.05); }
                .table-responsive { overflow-x: auto; margin-bottom: 1rem; }
                """
                soup.insert(0, style_tag)

                div = soup.new_tag('div')
                div['class'] = 'table-responsive'
                table_tag.wrap(div)

                with open(table_html_save_path, 'w', encoding='utf-8') as f:
                    f.write(str(soup))
            else:
                print(f"  Warning: Could not find table tag in HTML for table on page {page_num}. Skipping HTML save.")
                continue

            table_image_bytes = None
            if table_rect:
                try:
                    pix = page.get_pixmap(clip=table_rect)
                    table_image_bytes = pix.tobytes("png")

                    with open(table_image_save_path, "wb") as img_file:
                        img_file.write(table_image_bytes)

                except Exception as img_capture_e:
                    print(f"  Erreur lors de la capture d'image du tableau {i} page {page_num} : {img_capture_e}")
                    traceback.print_exc()
                    table_image_bytes = None

            table_and_image_data.append({
                'content_type': 'table',
                'table_html_url': relative_html_url_path,
                'table_text_representation': df.to_string(index=False),
                'rect': [table_rect.x0, table_rect.y0, table_rect.x1, table_rect.y1] if table_rect else None,
                'accuracy': table.accuracy,
                'image_bytes': table_image_bytes,
                'image_url': relative_image_url_path if table_image_bytes else None
            })

        return table_and_image_data

    except Exception as e:
        print(f"  Erreur lors de l'extraction des tableaux de la page {page_num} : {str(e)}")
        traceback.print_exc()
        return []


def extract_images_from_page(pdf_path, page, page_num, excluded_rects=None):
    """Extract and save images from a page, excluding specified regions (like tables)."""
    # Avoid a mutable default argument, which would be shared across calls.
    excluded_rects = excluded_rects or []
    image_data = []
    image_list = page.get_images(full=True)

    for img_index, img_info in enumerate(image_list):
        xref = img_info[0]
        try:
            base_image = page.parent.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            width = base_image["width"]
            height = base_image["height"]

            if width < IMAGE_MIN_WIDTH or height < IMAGE_MIN_HEIGHT:
                continue

            img_rect = None
            img_rects = page.get_image_rects(xref)
            if img_rects:
                img_rect = img_rects[0]

            if img_rect is None:
                print(f"  Warning: Could not find rectangle for image {img_index} on page {page_num}. Skipping.")
                continue

            is_excluded = False
            for excluded_rect in excluded_rects:
                if img_rect.intersects(excluded_rect):
                    is_excluded = True
                    break
            if is_excluded:
                print(f"  Image {img_index} on page {page_num} is within an excluded region (e.g., table). Skipping.")
                continue

            safe_pdf_name = "".join(c if c.isalnum() else "_" for c in pdf_path.stem)
            image_filename = f"{safe_pdf_name}_p{page_num}_img{img_index}.{image_ext}"
            image_save_path = IMAGE_SAVE_DIR / image_filename
            relative_url_path = f"/static/{IMAGE_SAVE_SUBDIR}/{image_filename}"

            with open(image_save_path, "wb") as img_file:
                img_file.write(image_bytes)

            image_data.append({
                'content_type': 'image',
                'image_url': relative_url_path,
                'rect': [img_rect.x0, img_rect.y0, img_rect.x1, img_rect.y1],
                'image_bytes': image_bytes
            })

        except Exception as img_save_e:
            print(f"  Erreur lors du traitement de l'image {img_index} de la page {page_num} : {img_save_e}")
            traceback.print_exc()

    return image_data

# --- Embedding and Description Generation Functions (Gemma and Sentence-Transformers) ---

def token_chunking(text, max_tokens, encoding):
    """Chunk text by token count, preferring paragraph and sentence boundaries."""
    if not text:
        return []

    tokens = encoding.encode(text)
    chunks = []
    start_token_idx = 0

    while start_token_idx < len(tokens):
        end_token_idx = min(start_token_idx + max_tokens, len(tokens))

        if end_token_idx < len(tokens):
            # Decode the candidate chunk and back off to the last paragraph
            # break, or failing that the last sentence end, so chunks do not
            # split mid-sentence.
            segment = encoding.decode(tokens[start_token_idx:end_token_idx])
            boundary = segment.rfind('\n\n')
            if boundary <= 0:
                sentence_ends = list(re.finditer(r'[.!?]\s', segment))
                boundary = sentence_ends[-1].end() if sentence_ends else -1
            if boundary > 0:
                end_token_idx = start_token_idx + len(encoding.encode(segment[:boundary]))

        chunk_text = encoding.decode(tokens[start_token_idx:end_token_idx]).strip()
        if chunk_text:
            chunks.append(chunk_text)

        # Always advance, even if the boundary search produced an empty step.
        if end_token_idx <= start_token_idx:
            start_token_idx += 1
        else:
            start_token_idx = end_token_idx

    return chunks
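
# Illustrative example: with cl100k_base and max_tokens=500, a ~1,200-token page
# of prose typically yields three chunks, each cut at the nearest preceding
# paragraph break or sentence end rather than mid-sentence.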


def generate_multimodal_description(image_bytes, prompt_text, max_retries=5, delay=10):
    """
    Generate a text description for an image using the Gemma multimodal model.
    Returns description text or None if all retries fail or model is not initialized.
    """
    global gemma_sampler

    if gemma_sampler is None:
        print("  Skipping multimodal description generation: Gemma sampler is not initialized.")
        return None

    # Convert image bytes to PIL Image and then to JAX NumPy array
    try:
        pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        # Gemma expects (H, W, C) numpy array, then converted to JAX numpy
        image_np = np.asarray(pil_image)
        gemma_image_input = jnp.asarray(image_np)
        # Gemma also expects batch dimension, so add it
        gemma_image_input = jnp.expand_dims(gemma_image_input, axis=0) # Shape: (1, H, W, C)
    except Exception as e:
        print(f"  Erreur lors de la conversion de l'image pour Gemma : {e}")
        return None

    for attempt in range(max_retries):
        try:
            # Local inference: no pre-call throttling needed; retries below
            # back off on failure.

            # Gemma 3 marks the image position in the prompt with the
            # <start_of_image> special token.
            full_prompt = f"{prompt_text} <start_of_image>"

            # sampler.chat expects `images` shaped (batch, num_images, H, W, C);
            # gemma_image_input is (1, H, W, C), so add the num_images axis.
            final_gemma_image_input = jnp.expand_dims(gemma_image_input, axis=1)  # (1, 1, H, W, C)

            out = gemma_sampler.chat(
                full_prompt,
                images=final_gemma_image_input,
                max_tokens=500 # Limit response length
            )
            description = out.strip()

            if description:
                return description
            else:
                print(f"  Tentative {attempt+1}/{max_retries}: Réponse vide ou inattendue du modèle multimodal Gemma.")
                if attempt < max_retries - 1:
                    retry_delay = delay * (2 ** attempt) + random.uniform(1, 5)
                    print(f"  Réessai dans {retry_delay:.2f}s...")
                    time.sleep(retry_delay)
                continue

        except Exception as e:
            error_msg = str(e)
            print(f"  Tentative {attempt+1}/{max_retries} échouée pour la description (Gemma) : {error_msg}")
            # Gemma is local, so no API errors like 429. Focus on general errors.
            if attempt < max_retries - 1:
                retry_delay = delay * (2 ** attempt) + random.uniform(1, 5)
                print(f"  Réessai dans {retry_delay:.2f}s...")
                time.sleep(retry_delay)
                continue
            else:
                print(f"  Toutes les {max_retries} tentatives ont échoué pour la description Gemma.")
                return None
    print(f"  Toutes les {max_retries} tentatives ont échoué pour la description (fin de boucle).")
    return None


def generate_text_embedding(text_content, max_retries=5, delay=5):
    """
    Generate text embedding using the Sentence-Transformers model.
    Returns embedding vector (list) or None if all retries fail or model is not initialized.
    """
    global text_embedding_model

    if text_embedding_model is None:
        print("  Skipping text embedding generation: Sentence-Transformers model is not initialized.")
        return None

    if not text_content or not text_content.strip():
        return None # Cannot embed empty text

    for attempt in range(max_retries):
        try:
            # Local model: no pre-call delay needed; failed attempts back off below.

            # Sentence-Transformers encode method
            embedding = text_embedding_model.encode(text_content, convert_to_numpy=True)
            if embedding is not None and len(embedding) == EMBEDDING_DIMENSION:
                return embedding.tolist() # Convert numpy array to list for JSON serialization
            else:
                print(f"  Tentative {attempt+1}/{max_retries}: Format d'embedding Sentence-Transformers inattendu. Réponse : {embedding}")
                return None

        except Exception as e:
            error_msg = str(e)
            print(f"  Tentative {attempt+1}/{max_retries} échouée pour l'embedding (Sentence-Transformers) : {error_msg}")
            if attempt < max_retries - 1:
                retry_delay = delay * (2 ** attempt) + random.uniform(0.5, 2)
                print(f"  Réessai dans {retry_delay:.2f}s...")
                time.sleep(retry_delay)
                continue
            else:
                print(f"  Toutes les {max_retries} tentatives ont échoué pour l'embedding (Sentence-Transformers).")
                return None
    print(f"  Toutes les {max_retries} tentatives ont échoué pour l'embedding (fin de boucle).")
    return None
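
# Downstream, these 384-dimensional vectors are typically compared with cosine
# similarity, e.g. np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)).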


# --- Main Processing Function ---

def process_pdfs_in_directory(directory):
    """Main processing pipeline for all PDFs in a directory."""
    all_embeddings_data = []
    processed_files = 0
    pdf_files = list(directory.glob("*.pdf"))
    total_files = len(pdf_files)

    if total_files == 0:
        print(f"Aucun fichier PDF trouvé dans le répertoire : {directory}")
        return []

    for pdf_file_path in pdf_files:
        processed_files += 1
        print(f"\nTraitement de {pdf_file_path.name} ({processed_files}/{total_files})...")

        page_data_list = extract_page_data_pymupdf(pdf_file_path)

        if not page_data_list:
            print(f"  Aucune donnée extraite de {pdf_file_path.name}.")
            continue

        for page_data in page_data_list:
            pdf_file = page_data['pdf_file']
            page_num = page_data['page_number']
            page_text = page_data['text']
            images = page_data['images']
            tables = page_data['tables']
            pdf_title = page_data.get('pdf_title')
            pdf_subject = page_data.get('pdf_subject')
            pdf_keywords = page_data.get('pdf_keywords')

            print(f"  Génération des descriptions et embeddings pour la page {page_num}...")

            # Process tables: Generate description and then embedding
            for table_idx, table in enumerate(tables):
                table_image_bytes = table.get('image_bytes')
                table_text_repr = table.get('table_text_representation', '')
                table_html_url = table.get('table_html_url')

                description = None
                if table_image_bytes:
                    prompt = "Décrivez en français le contenu et la structure de ce tableau. Mettez l'accent sur les données principales et les tendances si visibles."
                    print(f"  Page {page_num}: Génération de la description multimodale pour le tableau {table_idx}...")
                    description = generate_multimodal_description(table_image_bytes, prompt)
                elif table_text_repr:
                    # Fallback for text-only table description, using Gemma's text capabilities
                    if gemma_sampler:
                        prompt = f"Décrivez en français le contenu et la structure de ce tableau basé sur sa représentation textuelle:\n{table_text_repr[:1000]}..."
                        print(f"  Page {page_num}: Génération de la description textuelle pour le tableau {table_idx} (fallback via Gemma)...")
                        try:
                            # Gemma text-only generation
                            out = gemma_sampler.chat(prompt, max_tokens=500)
                            description = out.strip()
                        except Exception as e:
                            print(f"  Erreur lors de la génération de description textuelle pour le tableau {table_idx} via Gemma: {e}")
                            description = None
                    else:
                        print("  Skipping text description generation for table: Gemma sampler not initialized.")
                        description = None


                if description:
                    print(f"  Page {page_num}: Description générée pour le tableau {table_idx}.")
                    embedding_vector = generate_text_embedding(description)

                    if embedding_vector is not None:
                        chunk_data = {
                            "pdf_file": pdf_file,
                            "page_number": page_num,
                            "chunk_id": f"table_{table_idx}",
                            "content_type": "table",
                            "text_content": description,
                            "embedding": embedding_vector,
                            "table_html_url": table_html_url,
                            "image_url": table.get('image_url'),
                            "pdf_title": pdf_title,
                            "pdf_subject": pdf_subject,
                            "pdf_keywords": pdf_keywords
                        }
                        all_embeddings_data.append(chunk_data)
                        print(f"  Page {page_num}: Embedding généré pour la description du tableau {table_idx}.")
                    else:
                        print(f"  Page {page_num}: Échec de la génération de l'embedding pour la description du tableau {table_idx}. Chunk ignoré.")
                else:
                    print(f"  Page {page_num}: Aucune description générée pour le tableau {table_idx}. Chunk ignoré.")


            # Process images (non-table): Generate description and then embedding
            for img_idx, image in enumerate(images):
                image_bytes = image.get('image_bytes')
                image_url = image.get('image_url')

                if image_bytes:
                    prompt = "Décrivez en français le contenu de cette image. S'il s'agit d'un graphique, décrivez le type de graphique (histogramme, courbe, etc.), les axes, les légendes et les principales informations ou tendances visibles."
                    print(f"  Page {page_num}: Génération de la description multimodale pour l'image {img_idx}...")
                    description = generate_multimodal_description(image_bytes, prompt)

                    if description:
                        print(f"  Page {page_num}: Description générée pour l'image {img_idx}.")
                        embedding_vector = generate_text_embedding(description)

                        if embedding_vector is not None:
                            chunk_data = {
                                "pdf_file": pdf_file,
                                "page_number": page_num,
                                "chunk_id": f"image_{img_idx}",
                                "content_type": "image",
                                "text_content": description,
                                "embedding": embedding_vector,
                                "image_url": image_url,
                                "pdf_title": pdf_title,
                                "pdf_subject": pdf_subject,
                                "pdf_keywords": pdf_keywords
                            }
                            all_embeddings_data.append(chunk_data)
                            print(f"  Page {page_num}: Embedding généré pour la description de l'image {img_idx}.")
                        else:
                            print(f"  Page {page_num}: Échec de la génération de l'embedding pour la description de l'image {img_idx}. Chunk ignoré.")
                    else:
                        print(f"  Page {page_num}: Aucune description générée pour l'image {img_idx}. Chunk ignoré.")


            # Process regular text: Chunk and then generate embeddings
            text_chunks = []  # Initialized here so the page summary below never sees a stale value
            if page_text:
                try:
                    encoding = tiktoken.get_encoding(ENCODING_NAME)
                    text_chunks = token_chunking(page_text, MAX_TOKENS_NORMAL, encoding)
                except Exception as e:
                    print(f"Error chunking text on page {page_num}: {e}. Falling back to a single chunk.")
                    text_chunks = [page_text]


                for chunk_idx, chunk_content in enumerate(text_chunks):
                    print(f"  Page {page_num}: Génération de l'embedding pour le chunk de texte {chunk_idx}...")
                    embedding_vector = generate_text_embedding(chunk_content)

                    if embedding_vector is not None:
                        chunk_data = {
                            "pdf_file": pdf_file,
                            "page_number": page_num,
                            "chunk_id": f"text_{chunk_idx}",
                            "content_type": "text",
                            "text_content": chunk_content,
                            "embedding": embedding_vector,
                            "pdf_title": pdf_title,
                            "pdf_subject": pdf_subject,
                            "pdf_keywords": pdf_keywords
                        }
                        all_embeddings_data.append(chunk_data)
                        print(f"  Page {page_num}: Chunk de texte {chunk_idx} traité avec succès.")
                    else:
                        print(f"  Page {page_num}: Échec de la génération de l'embedding pour le chunk de texte {chunk_idx}. Chunk ignoré.")


            print(f"  Page {page_num} terminée. Éléments traités : {len(tables)} tableaux, {len(images)} images, {len(text_chunks)} chunks de texte.")


    return all_embeddings_data

# --- Main Execution ---
if __name__ == "__main__":
    print("Démarrage du traitement PDF multimodal avec génération de descriptions (Gemma) et embeddings textuels multilingues (Sentence-Transformers)...")

    # Validate and create directories
    if not PDF_DIRECTORY.is_dir():
        print(f"❌ ERREUR: Répertoire PDF non trouvé ou n'est pas un répertoire : {PDF_DIRECTORY}. Veuillez créer un répertoire 'docs' et y placer vos PDFs.")
        # Create it if it doesn't exist, for example PDF_DIRECTORY.mkdir(parents=True, exist_ok=True)
        # But for Colab, it's often better to instruct user to upload.
        exit(1)

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    IMAGE_SAVE_DIR.mkdir(parents=True, exist_ok=True)
    TABLE_SAVE_DIR.mkdir(parents=True, exist_ok=True)
    print(f"Répertoire de sortie : {OUTPUT_DIR}")
    print(f"Répertoire de sauvegarde des images : {IMAGE_SAVE_DIR}")
    print(f"Répertoire de sauvegarde des tableaux (HTML) : {TABLE_SAVE_DIR}")

    # Initialize Gemma and Sentence-Transformers models
    initialize_models()

    # If models failed to initialize, exit
    if gemma_sampler is None or text_embedding_model is None:
        print("Impossible de continuer car un ou plusieurs modèles n'ont pas pu être initialisés.")
        exit(1)

    final_embeddings = process_pdfs_in_directory(PDF_DIRECTORY)

    if final_embeddings:
        print(f"\nTotal d'embeddings générés : {len(final_embeddings)}.")
        try:
            with EMBEDDINGS_FILE_PATH.open('w', encoding='utf-8') as f:
                json.dump(final_embeddings, f, indent=2, ensure_ascii=False)
            print(f"Embeddings sauvegardés avec succès dans : {EMBEDDINGS_FILE_PATH}")
        except Exception as e:
            print(f"\nErreur lors de la sauvegarde du fichier JSON d'embeddings : {e}")
            traceback.print_exc()
    else:
        print("\nAucun embedding n'a été généré.")