Spaces:
Sleeping
Sleeping
@startuml NTR_FileParser | |
package "ntr_fileparser" { | |
package "data_classes" { | |
abstract class ParsedStructure { | |
+{abstract} apply(func: Callable[[str], str]) | |
+{abstract} to_dict() | |
+{abstract} to_string() | |
} | |
class ParsedDocument { | |
+name: str | |
+type: str | |
+meta: ParsedMeta | |
+paragraphs: list[ParsedTextBlock] | |
+tables: list[ParsedTable] | |
+images: list[ParsedImage] | |
+formulas: list[ParsedFormula] | |
} | |
class ParsedMeta { | |
+title: str | |
+author: str | |
+creation_date: str | |
} | |
class ParsedTextBlock { | |
+text: str | |
+style: TextStyle | |
} | |
enum TextStyle { | |
NORMAL | |
BOLD | |
ITALIC | |
UNDERLINE | |
HEADING1 | |
HEADING2 | |
HEADING3 | |
} | |
class ParsedTable { | |
+headers: list[str] | |
+rows: list[ParsedRow] | |
+subtables: list[ParsedSubtable] | |
+tag: TableTag | |
} | |
class ParsedRow { | |
+cells: list[str] | |
} | |
class ParsedSubtable { | |
+table: ParsedTable | |
} | |
enum TableTag { | |
UNKNOWN | |
DATA | |
METADATA | |
} | |
class ParsedImage { | |
+path: str | |
+alt_text: str | |
.. Примечание .. | |
В текущей реализации не используется | |
} | |
class ParsedFormula { | |
+latex: str | |
.. Примечание .. | |
В текущей реализации не используется | |
} | |
ParsedStructure <|-- ParsedDocument | |
ParsedStructure <|-- ParsedTextBlock | |
ParsedStructure <|-- ParsedTable | |
ParsedStructure <|-- ParsedRow | |
ParsedStructure <|-- ParsedSubtable | |
ParsedStructure <|-- ParsedImage | |
ParsedStructure <|-- ParsedFormula | |
ParsedStructure <|-- ParsedMeta | |
ParsedDocument o-- ParsedMeta | |
ParsedDocument o-- "*" ParsedTextBlock | |
ParsedDocument o-- "*" ParsedTable | |
ParsedDocument o-- "*" ParsedImage | |
ParsedDocument o-- "*" ParsedFormula | |
ParsedTable o-- "*" ParsedRow | |
ParsedTable o-- "*" ParsedSubtable | |
ParsedTable -- TableTag | |
ParsedTextBlock -- TextStyle | |
} | |
package "parsers" { | |
abstract class AbstractParser { | |
+file_types: list | |
+{abstract} parse() | |
+{abstract} parse_by_path() | |
+supports_file() | |
+_supported_extension() | |
} | |
class ParserFactory { | |
+parsers: list[AbstractParser] | |
+register_parser() | |
+get_parser() | |
} | |
class UniversalParser { | |
+factory: ParserFactory | |
+parse() | |
+parse_by_path() | |
} | |
enum FileType { | |
XML | |
DOCX | |
DOC | |
HTML | |
MD | |
EML | |
+from_extension() | |
+get_supported_extensions() | |
} | |
package "specific_parsers" { | |
package "xml" { | |
class XMLParagraphParser { | |
+parse() | |
} | |
class XMLTableParser { | |
+parse() | |
} | |
class XMLMetaParser { | |
+parse() | |
+_extract_info_value() | |
+_extract_info_recurse() | |
} | |
class XMLImageParser { | |
+parse() | |
.. Примечание .. | |
В текущей реализации не используется | |
} | |
class XMLFormulaParser { | |
+parse() | |
.. Примечание .. | |
В текущей реализации не используется | |
} | |
} | |
package "docx" { | |
class CorePropertiesParser { | |
+parse() | |
} | |
class MetadataParser { | |
+parse() | |
} | |
class NumberingParser { | |
+parse() | |
} | |
class RelationshipsParser { | |
+parse() | |
} | |
class StylesParser { | |
+parse() | |
} | |
} | |
class DocParser { | |
} | |
class DocxParser { | |
} | |
class PDFParser { | |
} | |
class XMLParser { | |
} | |
class HTMLParser { | |
} | |
class MarkdownParser { | |
} | |
class EmailParser { | |
} | |
XMLParser -- xml | |
DocxParser -- docx | |
} | |
AbstractParser <|-- DocParser | |
AbstractParser <|-- DocxParser | |
AbstractParser <|-- PDFParser | |
AbstractParser <|-- XMLParser | |
AbstractParser <|-- HTMLParser | |
AbstractParser <|-- MarkdownParser | |
AbstractParser <|-- EmailParser | |
AbstractParser -- FileType | |
ParserFactory o-- "*" AbstractParser | |
UniversalParser --> ParserFactory | |
} | |
data_classes <.. parsers : использует | |
} | |
@enduml |