ADucatez committed on
Commit
2e8bfe7
·
1 Parent(s): 1519bb8

Remove all unstructured notations

Browse files
configs/test_full_workflow.yaml DELETED
@@ -1,16 +0,0 @@
1
- # Full workflow
2
- # Requires OpenAI API key and only works with table_extraction:Unstructured and r
3
-
4
- pagefilter:
5
- type: FromFilename
6
-
7
- table_extraction:
8
- - type: Unstructured
9
- params:
10
- pdf_image_dpi: 300
11
- hi_res_model_name: "yolox"
12
-
13
- table_cleaning:
14
- - type: LLM
15
- params:
16
- openai_model: "gpt-4-turbo-preview"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/v0.yaml CHANGED
@@ -10,11 +10,6 @@ table_extraction:
10
  - type: Camelot
11
  params:
12
  flavor: lattice
13
- - type: Unstructured
14
- params:
15
- hi_res_model_name: "yolox"
16
- pdf_image_dpi: 300
17
  # - type: LLamaParse
18
- # - type: UnstructuredAPI
19
 
20
  # table_cleaning:
 
10
  - type: Camelot
11
  params:
12
  flavor: lattice
 
 
 
 
13
  # - type: LLamaParse
 
14
 
15
  # table_cleaning:
country_by_country/table_extraction/unstructured.py DELETED
@@ -1,69 +0,0 @@
1
- # MIT License
2
- #
3
- # Copyright (c) 2024 dataforgood
4
- #
5
- # Permission is hereby granted, free of charge, to any person obtaining a copy
6
- # of this software and associated documentation files (the "Software"), to deal
7
- # in the Software without restriction, including without limitation the rights
8
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- # copies of the Software, and to permit persons to whom the Software is
10
- # furnished to do so, subject to the following conditions:
11
- #
12
- # The above copyright notice and this permission notice shall be included in all
13
- # copies or substantial portions of the Software.
14
- #
15
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- # SOFTWARE.
22
-
23
- # Standard imports
24
- import logging
25
- import uuid
26
-
27
- # External imports
28
- from io import StringIO
29
-
30
- import pandas as pd
31
- from unstructured.partition.pdf import partition_pdf
32
-
33
-
34
- class Unstructured:
35
- def __init__(self, **kwargs: dict) -> dict:
36
- """
37
- Builds a pdf page parser, looking for tables using
38
- the unstructured library.
39
- The kwargs given to the constructor are directly propagated
40
- to the partition_pdf function.
41
- You are free to define any parameter partition_pdf recognizes
42
- """
43
- self.kwargs = kwargs
44
- self.type = "unstructured"
45
-
46
- def __call__(self, pdf_filepath: str) -> dict:
47
- logging.info("\nKicking off extraction stage...")
48
- logging.info(f"Extraction type: {self.type}, with params: {self.kwargs}")
49
-
50
- elements = partition_pdf(
51
- pdf_filepath,
52
- infer_table_structure=True,
53
- strategy="hi_res",
54
- **self.kwargs,
55
- )
56
- tables_list = [el for el in elements if el.category == "Table"]
57
- tables_list = [
58
- pd.read_html(StringIO(t.metadata.text_as_html))[0] for t in tables_list
59
- ]
60
-
61
- # Create asset
62
- new_asset = {
63
- "id": uuid.uuid4(),
64
- "type": "unstructured",
65
- "params": self.kwargs,
66
- "tables": tables_list,
67
- }
68
-
69
- return new_asset
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
country_by_country/table_extraction/unstructured_api.py DELETED
@@ -1,98 +0,0 @@
1
- # MIT License
2
- #
3
- # Copyright (c) 2024 dataforgood
4
- #
5
- # Permission is hereby granted, free of charge, to any person obtaining a copy
6
- # of this software and associated documentation files (the "Software"), to deal
7
- # in the Software without restriction, including without limitation the rights
8
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- # copies of the Software, and to permit persons to whom the Software is
10
- # furnished to do so, subject to the following conditions:
11
- #
12
- # The above copyright notice and this permission notice shall be included in all
13
- # copies or substantial portions of the Software.
14
- #
15
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- # SOFTWARE.
22
-
23
- # Standard imports
24
- import logging
25
-
26
- # External imports
27
- import os
28
- import uuid
29
- from io import StringIO
30
- from pathlib import Path
31
-
32
- import pandas as pd
33
- from unstructured_client import UnstructuredClient
34
- from unstructured_client.models import shared
35
-
36
-
37
- class UnstructuredAPI:
38
- def __init__(self, **kwargs: dict) -> dict:
39
- """
40
- Builds a pdf page parser, looking for tables using
41
- the unstructured.io api.
42
- The kwargs given to the constructor are directly propagated
43
- to the partition_pdf function.
44
- You are free to define any parameter partition_pdf recognizes
45
- """
46
- self.kwargs = kwargs
47
- self.type = "unstructured_api"
48
-
49
- def __call__(self, pdf_filepath: str) -> dict:
50
- logging.info("\nKicking off extraction stage...")
51
- logging.info(f"Extraction type: {self.type}, with params: {self.kwargs}")
52
-
53
- s = UnstructuredClient(api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"))
54
-
55
- with Path(pdf_filepath).open("rb") as f:
56
- # Note that this currently only supports a single file
57
- files = shared.Files(
58
- content=f.read(),
59
- file_name=pdf_filepath,
60
- )
61
-
62
- req = shared.PartitionParameters(
63
- files=files,
64
- strategy="hi_res",
65
- pdf_infer_table_structure="True",
66
- **self.kwargs,
67
- )
68
-
69
- try:
70
- resp = s.general.partition(req)
71
- except Exception as e:
72
- print(e)
73
- else:
74
- tables_list = []
75
- for el in resp.elements:
76
- if el["type"] == "Table":
77
- # Enclose in try block to ignore case when pandas can't read the table
78
- # Happens when the html is incorrectly formatted
79
- try:
80
- table = pd.read_html(StringIO(el["metadata"]["text_as_html"]))[
81
- 0
82
- ]
83
- except Exception:
84
- logging.info(
85
- "Html table discarded. Pandas couldn't read the table.",
86
- )
87
- else:
88
- tables_list.append(table)
89
-
90
- # Create asset
91
- new_asset = {
92
- "id": uuid.uuid4(),
93
- "type": "unstructured_api",
94
- "params": self.kwargs,
95
- "tables": tables_list,
96
- }
97
-
98
- return new_asset