import logging
import pickle
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Callable
import numpy as np
import pandas as pd
from common.constants import UNKNOWN
from components.embedding_extraction import EmbeddingExtractor
logger = logging.getLogger(__name__)
@dataclass
class DatasetRow:
    """A single row of the documents dataset.

    Required identification fields come first; all structural-location
    fields default to the project-wide ``UNKNOWN`` marker.

    NOTE(review): the ``Pargaraph*`` names carry a historical typo that is
    preserved deliberately — downstream code addresses columns by these
    exact names.
    """

    Index: int
    Text: str
    DocName: str
    Title: str
    DocNumber: str
    LevelParagraph: str = UNKNOWN
    Pargaraph: str = UNKNOWN
    Duplicate: str = UNKNOWN
    PartLevel1: str = UNKNOWN
    PartLevel2: str = UNKNOWN
    Appendix: str = UNKNOWN
    LevelParagraphAppendix: str = UNKNOWN
    PargaraphAppendix: str = UNKNOWN
    DuplicateAppendix: str = UNKNOWN
    PartLevel1Appendix: str = UNKNOWN
    Table: str = UNKNOWN
class DocumentsDataset:
    """In-memory dataset of document rows with optional text embeddings.

    Holds a list of :class:`DatasetRow` objects. Initially no vectors are
    present; call :meth:`vectorize_with` to compute embeddings for all rows.
    """

    def __init__(self, rows: list[DatasetRow]):
        self.rows = rows
        # Filled in by vectorize_with(); None until the dataset is vectorized.
        self.vectors: np.ndarray | None = None

    def vectorize_with(
        self,
        vectorizer: EmbeddingExtractor,
        progress_callback: Callable[[int, int], None] | None = None,
    ) -> None:
        """Vectorize all row texts in the dataset.

        Args:
            vectorizer: Embedding extractor used to encode the texts.
            progress_callback: Optional callable invoked by the extractor
                to report progress (presumably (done, total) — confirm
                against EmbeddingExtractor.vectorize).
        """
        logger.info('Starting dataset vectorization')
        total = len(self.rows)
        texts = [row.Text for row in self.rows]
        self.vectors = vectorizer.vectorize(texts, progress_callback)
        logger.info(f'Completed vectorization of {total} rows')

    def to_pandas(self) -> pd.DataFrame:
        """Convert the dataset to a pandas DataFrame.

        Returns:
            pd.DataFrame: One row per DatasetRow, plus an 'Embedding'
            column holding a list of floats per row, or NaN when the
            dataset has not been vectorized.
        """
        df = pd.DataFrame([asdict(row) for row in self.rows])
        if self.vectors is not None:
            df['Embedding'] = self.vectors.tolist()
        else:
            df['Embedding'] = np.nan
        return df

    def to_pickle(self, path: Path) -> None:
        """Save the dataset to a pickle file as a pandas DataFrame."""
        logger.info(f'Saving dataset to {path}')
        with open(path, 'wb') as f:
            pickle.dump(self.to_pandas(), f)
        logger.info('Dataset saved successfully')

    @classmethod
    def _from_dataframe(cls, df: pd.DataFrame) -> 'DocumentsDataset':
        """Rebuild a dataset from the DataFrame layout written by to_pandas()."""
        df = df.copy()
        embeddings = df.pop('Embedding') if 'Embedding' in df.columns else None
        rows = [DatasetRow(**record) for record in df.to_dict('records')]
        dataset = cls(rows)
        # Restore vectors only when every row actually carries an embedding;
        # to_pandas() writes NaN for a non-vectorized dataset.
        if embeddings is not None and len(embeddings) and embeddings.notna().all():
            dataset.vectors = np.asarray(embeddings.tolist())
        return dataset

    @classmethod
    def from_pickle(cls, path: Path) -> 'DocumentsDataset':
        """Load a dataset from a pickle file.

        Fixes the broken round-trip: to_pickle() stores a pandas
        DataFrame, but the previous implementation returned the unpickled
        object unchanged and then failed on ``dataset.rows``. The dataset
        is now reconstructed from the DataFrame; legacy pickles containing
        a DocumentsDataset instance are also accepted.

        Raises:
            TypeError: If the pickle holds neither a DataFrame nor a
                DocumentsDataset.
            Exception: Any unpickling/IO error is logged and re-raised.
        """
        logger.info(f'Loading dataset from {path}')
        try:
            # SECURITY: pickle.load executes arbitrary code during
            # deserialization — only load files from trusted sources.
            with open(path, 'rb') as f:
                payload = pickle.load(f)
            if isinstance(payload, cls):
                dataset = payload
            elif isinstance(payload, pd.DataFrame):
                dataset = cls._from_dataframe(payload)
            else:
                raise TypeError(
                    f'Unsupported pickle payload type: {type(payload)!r}'
                )
            logger.info(f'Loaded dataset with {len(dataset.rows)} rows')
            return dataset
        except Exception as e:
            logger.error(f'Failed to load dataset: {e}')
            raise