Spaces:
Sleeping
Sleeping
File size: 2,282 Bytes
4107940 dd7488f 1b47089 4107940 1b47089 4107940 dd7488f 8de7c36 dd7488f 8de7c36 1b47089 4107940 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
from io import StringIO
import core.pipelines as pipelines_functions
from inspect import getmembers, isfunction
from newspaper import Article
from PyPDF2 import PdfFileReader
import streamlit as st
import pandas as pd
def get_pipelines():
    """Return display names and callables for every function in the pipelines module.

    Functions are discovered with ``inspect.getmembers`` on the imported
    ``pipelines_functions`` module. Each function name is turned into a
    human-readable label by splitting on underscores, capitalizing every
    part, and joining the parts with spaces (``"my_pipeline"`` becomes
    ``"My Pipeline"``).

    Returns:
        A ``(names, funcs)`` pair: ``names`` is a list of display labels and
        ``funcs`` is a tuple of the matching function objects, in the same
        order.

    Raises:
        ValueError: If the module exposes no functions, because the
            name/function pairs cannot be unpacked into two sequences.
    """
    members = getmembers(pipelines_functions, isfunction)
    # Transpose [(name, func), ...] into (names...), (funcs...).
    raw_names, pipeline_funcs = zip(*members)

    display_names = []
    for raw_name in raw_names:
        words = [word.capitalize() for word in raw_name.split("_")]
        display_names.append(" ".join(words))

    return display_names, pipeline_funcs
@st.experimental_memo
def extract_text_from_url(url: str):
    """Download the web page at ``url`` and return its extracted article text.

    The page is fetched and parsed with ``newspaper.Article``. The function
    is wrapped in Streamlit's ``experimental_memo`` decorator.

    Args:
        url: Address of the article to fetch.

    Returns:
        The ``text`` attribute of the parsed article.
    """
    fetched = Article(url)
    # download() must run before parse(); parse() fills in the text attribute.
    fetched.download()
    fetched.parse()
    return fetched.text
def _read_plain_text(file):
    """Decode an uploaded plain-text file as UTF-8 and return its contents."""
    return file.getvalue().decode("utf-8")


def _read_pdf_text(file):
    """Concatenate the extractable text of every page of an uploaded PDF."""
    reader = PdfFileReader(file)
    page_texts = []
    for page_number in range(reader.numPages):
        try:
            page_texts.append(reader.getPage(page_number).extractText())
        except Exception:
            # Best effort: a page that cannot be read or extracted is skipped
            # so the remaining pages are still returned. Catching Exception
            # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
            continue
    return "".join(page_texts)


def _read_csv_text(file):
    """Join every string cell of an uploaded CSV, row by row."""
    frame = pd.read_csv(file)
    pieces = []
    # Keeping only ``str`` cells is equivalent to selecting the text columns
    # and dropping NaNs (missing values are floats, never strings), and it
    # does not depend on which dtype label pandas gives to text columns.
    for row in frame.itertuples(index=False, name=None):
        for cell in row:
            if isinstance(cell, str):
                # Each cell is prefixed with a space, so a non-empty result
                # starts with a leading space.
                pieces.append(" " + cell)
    return "".join(pieces)


def extract_text_from_file(file):
    """Extract plain text from an uploaded file based on its MIME type.

    Supported types:

    * ``text/plain`` - the file bytes decoded as UTF-8.
    * ``application/pdf`` - the text of all pages concatenated; pages whose
      text cannot be extracted are skipped.
    * ``text/csv`` - every string cell, in row-major order, each prefixed
      with a single space; numeric and missing cells are ignored.

    Args:
        file: Uploaded file object exposing a ``type`` attribute holding the
            MIME type, a ``getvalue()`` method returning bytes, and a
            file-like read interface.

    Returns:
        The extracted text, or ``None`` when the MIME type is not supported
        (a Streamlit warning is shown in that case).

    Raises:
        UnicodeDecodeError: If a ``text/plain`` file is not valid UTF-8.
    """
    mime_type = file.type
    if mime_type == "text/plain":
        return _read_plain_text(file)
    if mime_type == "application/pdf":
        return _read_pdf_text(file)
    if mime_type == "text/csv":
        return _read_csv_text(file)

    st.warning(f"File type {file.type} not supported")
    return None
|