Commit
·
96ac47b
1
Parent(s):
bbf844d
Made example display conditional on example file existence. Turned example display off by default. Removed (mostly) unnecessary multi-os-test workflow
Browse files
- .github/workflows/{multi-os-test.yml → archive_workflows/multi-os-test.yml} +0 -0
- app.py +178 -123
- tools/config.py +1 -1
.github/workflows/{multi-os-test.yml → archive_workflows/multi-os-test.yml}
RENAMED
File without changes
|
app.py
CHANGED
@@ -1008,70 +1008,88 @@ with app:
|
|
1008 |
###
|
1009 |
with gr.Tab("Redact PDFs/images"):
|
1010 |
|
|
|
1011 |
# Examples for PDF/image redaction
|
1012 |
if SHOW_EXAMPLES == "True":
|
1013 |
gr.Markdown(
|
1014 |
"### Try an example - Click on an example below and then the 'Extract text and redact document' button:"
|
1015 |
)
|
1016 |
-
|
1017 |
-
|
1018 |
-
|
1019 |
-
|
1020 |
-
|
1021 |
-
|
1022 |
-
|
1023 |
-
|
1024 |
-
|
1025 |
-
|
1026 |
-
|
1027 |
-
|
1028 |
-
|
1029 |
-
|
1030 |
-
|
1031 |
-
[
|
1032 |
-
|
1033 |
-
|
1034 |
-
|
1035 |
-
|
1036 |
-
|
1037 |
-
|
1038 |
-
|
1039 |
-
|
1040 |
-
|
1041 |
-
|
1042 |
-
|
1043 |
-
|
1044 |
-
|
1045 |
-
|
1046 |
-
|
1047 |
-
|
1048 |
-
|
1049 |
-
[
|
1050 |
-
|
1051 |
-
|
1052 |
-
|
1053 |
-
|
1054 |
-
|
1055 |
-
|
1056 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1057 |
],
|
1058 |
-
|
1059 |
-
|
1060 |
-
in_doc_files,
|
1061 |
-
text_extract_method_radio,
|
1062 |
-
pii_identification_method_drop,
|
1063 |
-
handwrite_signature_checkbox,
|
1064 |
-
in_redact_entities,
|
1065 |
-
in_redact_comprehend_entities,
|
1066 |
-
prepared_pdf_state,
|
1067 |
-
],
|
1068 |
-
example_labels=[
|
1069 |
-
"PDF with selectable text redaction",
|
1070 |
-
"Image redaction with local OCR",
|
1071 |
-
"PDF redaction with custom entities (TITLES, PERSON, DATE_TIME)",
|
1072 |
-
"PDF redaction with AWS services and signature detection",
|
1073 |
-
],
|
1074 |
-
)
|
1075 |
|
1076 |
with gr.Accordion("Redact document", open=True):
|
1077 |
# in_doc_files = gr.File(
|
@@ -1651,36 +1669,37 @@ with app:
|
|
1651 |
gr.Markdown(
|
1652 |
"### Try an example - Click on an example below and then the 'Identify duplicate pages/subdocuments' button:"
|
1653 |
)
|
1654 |
-
|
1655 |
-
|
1656 |
-
|
|
|
|
|
|
|
|
|
1657 |
[
|
1658 |
-
|
|
|
|
|
|
|
1659 |
],
|
1660 |
-
0.95,
|
1661 |
-
10,
|
1662 |
-
True,
|
1663 |
-
],
|
1664 |
-
[
|
1665 |
[
|
1666 |
-
|
|
|
|
|
|
|
1667 |
],
|
1668 |
-
0.95,
|
1669 |
-
3,
|
1670 |
-
False,
|
1671 |
],
|
1672 |
-
|
1673 |
-
|
1674 |
-
|
1675 |
-
|
1676 |
-
|
1677 |
-
|
1678 |
-
|
1679 |
-
|
1680 |
-
|
1681 |
-
|
1682 |
-
|
1683 |
-
)
|
1684 |
|
1685 |
with gr.Accordion("Step 1: Configure and run analysis", open=True):
|
1686 |
# in_duplicate_pages = gr.File(
|
@@ -1810,47 +1829,65 @@ with app:
|
|
1810 |
gr.Markdown(
|
1811 |
"### Try an example - Click on an example below and then the 'Redact text/data files' button for redaction, or the 'Find duplicate cells/rows' button for duplicate detection:"
|
1812 |
)
|
1813 |
-
|
1814 |
-
|
1815 |
-
|
1816 |
-
|
1817 |
-
|
1818 |
-
|
1819 |
-
|
1820 |
-
|
1821 |
-
|
1822 |
-
|
1823 |
-
|
1824 |
-
|
1825 |
-
|
1826 |
-
|
1827 |
-
|
1828 |
-
|
1829 |
-
|
1830 |
-
|
1831 |
-
[
|
1832 |
-
|
1833 |
-
|
1834 |
-
|
1835 |
-
|
1836 |
-
|
1837 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1838 |
],
|
1839 |
-
|
1840 |
-
|
1841 |
-
in_data_files,
|
1842 |
-
in_colnames,
|
1843 |
-
pii_identification_method_drop_tabular,
|
1844 |
-
anon_strategy,
|
1845 |
-
in_tabular_duplicate_files,
|
1846 |
-
tabular_text_columns,
|
1847 |
-
],
|
1848 |
-
example_labels=[
|
1849 |
-
"CSV file redaction with specific columns - remove text",
|
1850 |
-
"Word document redaction - replace with REDACTED",
|
1851 |
-
"Tabular duplicate detection in CSV files",
|
1852 |
-
],
|
1853 |
-
)
|
1854 |
|
1855 |
with gr.Accordion("Redact Word or Excel/csv files", open=True):
|
1856 |
with gr.Accordion("Upload docx, xlsx, or csv files", open=True):
|
@@ -6385,3 +6422,21 @@ if __name__ == "__main__":
|
|
6385 |
|
6386 |
# Run the CLI main function with direct mode arguments
|
6387 |
main(direct_mode_args=direct_mode_args)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1008 |
###
|
1009 |
with gr.Tab("Redact PDFs/images"):
|
1010 |
|
1011 |
+
# Examples for PDF/image redaction
|
1012 |
# Examples for PDF/image redaction
|
1013 |
if SHOW_EXAMPLES == "True":
|
1014 |
gr.Markdown(
|
1015 |
"### Try an example - Click on an example below and then the 'Extract text and redact document' button:"
|
1016 |
)
|
1017 |
+
|
1018 |
+
# Check which example files exist and create examples only for available files
|
1019 |
+
example_files = [
|
1020 |
+
"example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
|
1021 |
+
"example_data/example_complaint_letter.jpg",
|
1022 |
+
"example_data/graduate-job-example-cover-letter.pdf",
|
1023 |
+
"example_data/Partnership-Agreement-Toolkit_0_0.pdf"
|
1024 |
+
]
|
1025 |
+
|
1026 |
+
available_examples = []
|
1027 |
+
example_labels = []
|
1028 |
+
|
1029 |
+
# Check each example file and add to examples if it exists
|
1030 |
+
if os.path.exists(example_files[0]):
|
1031 |
+
available_examples.append([
|
1032 |
+
[example_files[0]],
|
1033 |
+
"Local model - selectable text",
|
1034 |
+
"Local",
|
1035 |
+
[],
|
1036 |
+
CHOSEN_REDACT_ENTITIES,
|
1037 |
+
CHOSEN_COMPREHEND_ENTITIES,
|
1038 |
+
[example_files[0]],
|
1039 |
+
])
|
1040 |
+
example_labels.append("PDF with selectable text redaction")
|
1041 |
+
|
1042 |
+
if os.path.exists(example_files[1]):
|
1043 |
+
available_examples.append([
|
1044 |
+
[example_files[1]],
|
1045 |
+
"Local OCR model - PDFs without selectable text",
|
1046 |
+
"Local",
|
1047 |
+
[],
|
1048 |
+
CHOSEN_REDACT_ENTITIES,
|
1049 |
+
CHOSEN_COMPREHEND_ENTITIES,
|
1050 |
+
[example_files[1]],
|
1051 |
+
])
|
1052 |
+
example_labels.append("Image redaction with local OCR")
|
1053 |
+
|
1054 |
+
if os.path.exists(example_files[2]):
|
1055 |
+
available_examples.append([
|
1056 |
+
[example_files[2]],
|
1057 |
+
"Local OCR model - PDFs without selectable text",
|
1058 |
+
"Local",
|
1059 |
+
[],
|
1060 |
+
["TITLES", "PERSON", "DATE_TIME"],
|
1061 |
+
CHOSEN_COMPREHEND_ENTITIES,
|
1062 |
+
[example_files[2]],
|
1063 |
+
])
|
1064 |
+
example_labels.append("PDF redaction with custom entities (TITLES, PERSON, DATE_TIME)")
|
1065 |
+
|
1066 |
+
if os.path.exists(example_files[3]):
|
1067 |
+
available_examples.append([
|
1068 |
+
[example_files[3]],
|
1069 |
+
"AWS Textract service - all PDF types",
|
1070 |
+
"AWS Comprehend",
|
1071 |
+
["Extract handwriting", "Extract signatures"],
|
1072 |
+
CHOSEN_REDACT_ENTITIES,
|
1073 |
+
CHOSEN_COMPREHEND_ENTITIES,
|
1074 |
+
[example_files[3]],
|
1075 |
+
])
|
1076 |
+
example_labels.append("PDF redaction with AWS services and signature detection")
|
1077 |
+
|
1078 |
+
# Only create examples if we have available files
|
1079 |
+
if available_examples:
|
1080 |
+
redaction_examples = gr.Examples(
|
1081 |
+
examples=available_examples,
|
1082 |
+
inputs=[
|
1083 |
+
in_doc_files,
|
1084 |
+
text_extract_method_radio,
|
1085 |
+
pii_identification_method_drop,
|
1086 |
+
handwrite_signature_checkbox,
|
1087 |
+
in_redact_entities,
|
1088 |
+
in_redact_comprehend_entities,
|
1089 |
+
prepared_pdf_state,
|
1090 |
],
|
1091 |
+
example_labels=example_labels,
|
1092 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1093 |
|
1094 |
with gr.Accordion("Redact document", open=True):
|
1095 |
# in_doc_files = gr.File(
|
|
|
1669 |
gr.Markdown(
|
1670 |
"### Try an example - Click on an example below and then the 'Identify duplicate pages/subdocuments' button:"
|
1671 |
)
|
1672 |
+
|
1673 |
+
# Check if duplicate example file exists
|
1674 |
+
duplicate_example_file = "example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv"
|
1675 |
+
|
1676 |
+
if os.path.exists(duplicate_example_file):
|
1677 |
+
duplicate_examples = gr.Examples(
|
1678 |
+
examples=[
|
1679 |
[
|
1680 |
+
[duplicate_example_file],
|
1681 |
+
0.95,
|
1682 |
+
10,
|
1683 |
+
True,
|
1684 |
],
|
|
|
|
|
|
|
|
|
|
|
1685 |
[
|
1686 |
+
[duplicate_example_file],
|
1687 |
+
0.95,
|
1688 |
+
3,
|
1689 |
+
False,
|
1690 |
],
|
|
|
|
|
|
|
1691 |
],
|
1692 |
+
inputs=[
|
1693 |
+
in_duplicate_pages,
|
1694 |
+
duplicate_threshold_input,
|
1695 |
+
min_word_count_input,
|
1696 |
+
combine_page_text_for_duplicates_bool,
|
1697 |
+
],
|
1698 |
+
example_labels=[
|
1699 |
+
"Find duplicate pages of text in document OCR outputs",
|
1700 |
+
"Find duplicate text lines in document OCR outputs",
|
1701 |
+
],
|
1702 |
+
)
|
|
|
1703 |
|
1704 |
with gr.Accordion("Step 1: Configure and run analysis", open=True):
|
1705 |
# in_duplicate_pages = gr.File(
|
|
|
1829 |
gr.Markdown(
|
1830 |
"### Try an example - Click on an example below and then the 'Redact text/data files' button for redaction, or the 'Find duplicate cells/rows' button for duplicate detection:"
|
1831 |
)
|
1832 |
+
|
1833 |
+
# Check which tabular example files exist
|
1834 |
+
tabular_example_files = [
|
1835 |
+
"example_data/combined_case_notes.csv",
|
1836 |
+
"example_data/Bold minimalist professional cover letter.docx",
|
1837 |
+
"example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv"
|
1838 |
+
]
|
1839 |
+
|
1840 |
+
available_tabular_examples = []
|
1841 |
+
tabular_example_labels = []
|
1842 |
+
|
1843 |
+
# Check each tabular example file and add to examples if it exists
|
1844 |
+
if os.path.exists(tabular_example_files[0]):
|
1845 |
+
available_tabular_examples.append([
|
1846 |
+
[tabular_example_files[0]],
|
1847 |
+
["Case Note", "Client"],
|
1848 |
+
"Local",
|
1849 |
+
"replace with 'REDACTED'",
|
1850 |
+
[tabular_example_files[0]],
|
1851 |
+
["Case Note"],
|
1852 |
+
])
|
1853 |
+
tabular_example_labels.append("CSV file redaction with specific columns - remove text")
|
1854 |
+
|
1855 |
+
if os.path.exists(tabular_example_files[1]):
|
1856 |
+
available_tabular_examples.append([
|
1857 |
+
[tabular_example_files[1]],
|
1858 |
+
[],
|
1859 |
+
"Local",
|
1860 |
+
"replace with 'REDACTED'",
|
1861 |
+
[],
|
1862 |
+
[],
|
1863 |
+
])
|
1864 |
+
tabular_example_labels.append("Word document redaction - replace with REDACTED")
|
1865 |
+
|
1866 |
+
if os.path.exists(tabular_example_files[2]):
|
1867 |
+
available_tabular_examples.append([
|
1868 |
+
[tabular_example_files[2]],
|
1869 |
+
["text"],
|
1870 |
+
"Local",
|
1871 |
+
"replace with 'REDACTED'",
|
1872 |
+
[tabular_example_files[2]],
|
1873 |
+
["text"],
|
1874 |
+
])
|
1875 |
+
tabular_example_labels.append("Tabular duplicate detection in CSV files")
|
1876 |
+
|
1877 |
+
# Only create examples if we have available files
|
1878 |
+
if available_tabular_examples:
|
1879 |
+
tabular_examples = gr.Examples(
|
1880 |
+
examples=available_tabular_examples,
|
1881 |
+
inputs=[
|
1882 |
+
in_data_files,
|
1883 |
+
in_colnames,
|
1884 |
+
pii_identification_method_drop_tabular,
|
1885 |
+
anon_strategy,
|
1886 |
+
in_tabular_duplicate_files,
|
1887 |
+
tabular_text_columns,
|
1888 |
],
|
1889 |
+
example_labels=tabular_example_labels,
|
1890 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1891 |
|
1892 |
with gr.Accordion("Redact Word or Excel/csv files", open=True):
|
1893 |
with gr.Accordion("Upload docx, xlsx, or csv files", open=True):
|
|
|
6422 |
|
6423 |
# Run the CLI main function with direct mode arguments
|
6424 |
main(direct_mode_args=direct_mode_args)
|
6425 |
+
|
6426 |
+
|
6427 |
+
# Combine extraction options
|
6428 |
+
extraction_options = (
|
6429 |
+
list(direct_mode_args["handwrite_signature_extraction"])
|
6430 |
+
if direct_mode_args["handwrite_signature_extraction"]
|
6431 |
+
else []
|
6432 |
+
)
|
6433 |
+
if direct_mode_args["extract_forms"]:
|
6434 |
+
extraction_options.append("Extract forms")
|
6435 |
+
if direct_mode_args["extract_tables"]:
|
6436 |
+
extraction_options.append("Extract tables")
|
6437 |
+
if direct_mode_args["extract_layout"]:
|
6438 |
+
extraction_options.append("Extract layout")
|
6439 |
+
direct_mode_args["handwrite_signature_extraction"] = extraction_options
|
6440 |
+
|
6441 |
+
# Run the CLI main function with direct mode arguments
|
6442 |
+
main(direct_mode_args=direct_mode_args)
|
tools/config.py
CHANGED
@@ -543,7 +543,7 @@ except Exception as e:
|
|
543 |
# Get some environment variables and Launch the Gradio app
|
544 |
COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "0")
|
545 |
|
546 |
-
SHOW_EXAMPLES = get_or_create_env_var("SHOW_EXAMPLES", "True")
|
547 |
|
548 |
RUN_DIRECT_MODE = get_or_create_env_var("RUN_DIRECT_MODE", "0")
|
549 |
|
|
|
543 |
# Get some environment variables and Launch the Gradio app
|
544 |
COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "0")
|
545 |
|
546 |
+
SHOW_EXAMPLES = get_or_create_env_var("SHOW_EXAMPLES", "False")
|
547 |
|
548 |
RUN_DIRECT_MODE = get_or_create_env_var("RUN_DIRECT_MODE", "0")
|
549 |
|