Spaces:
Runtime error
Runtime error
Init
Browse files- .gitattributes +2 -27
- .gitignore +1 -0
- README.md +4 -4
- app.py +39 -0
- data/benefit_vectors.json +3 -0
- data/condition_vectors.json +3 -0
- data/health_aspects.json +3 -0
- data/img/Jellymation.gif +3 -0
- data/products.json +3 -0
- requirements.txt +8 -0
- style.css +58 -0
- support_functions.py +296 -0
- visualize_dataset.py +128 -0
- visualize_pipeline.py +128 -0
.gitattributes
CHANGED
|
@@ -1,27 +1,2 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
data/*.json filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
data/img/*.gif filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
__pycache__
|
README.md
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
---
|
| 2 |
-
title: Healthsea
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: streamlit
|
| 7 |
app_file: app.py
|
| 8 |
pinned: false
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Healthsea
|
| 3 |
+
emoji: 🪐
|
| 4 |
+
colorFrom: yellow
|
| 5 |
+
colorTo: pink
|
| 6 |
sdk: streamlit
|
| 7 |
app_file: app.py
|
| 8 |
pinned: false
|
app.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
from visualize_dataset import visualize_dataset
|
| 3 |
+
from visualize_pipeline import visualize_pipeline
|
| 4 |
+
|
| 5 |
+
# Header: inject the app-wide stylesheet before anything else is drawn.
with open("style.css") as css_file:
    stylesheet = css_file.read()
st.markdown(f"<style>{stylesheet}</style>", unsafe_allow_html=True)

st.title("Welcome to Healthsea 🪐")

# Two-column intro: text on the left, animated jellyfish on the right.
intro, jellyfish = st.columns(2)
jellyfish.markdown("\n")

data_load_state = intro.subheader("Create easier access to health✨")

jellyfish.image("data/img/Jellymation.gif")

# The three intro paragraphs are rendered in order into the left column.
for paragraph in (
    "Healthsea is a spaCy v3 pipeline that analyzes user reviews to supplement products by extracting their effects on health.",
    """With this app, you're able to explore the results of healthsea on up to 1 million reviews.
You can search for any health aspect, whether it is an disease (e.g. joint pain) or a desired health effect such as (e.g. energy),
the app returns a list of the best products and substances. You can also explore the capabilities of the pipeline itself, by writing custom reviews and
see every processing step of the pipeline.
""",
    """If you want to learn more about healthsea, you can read more in our [blog post]().
""",
):
    intro.markdown(paragraph)

st.markdown("""---""")

# Page dispatch: anything other than the dataset view shows the pipeline view.
pages = {
    "Visualize dataset": visualize_dataset,
    "Visualize pipeline": visualize_pipeline,
}
app_type = st.selectbox("Choose app", list(pages))
pages.get(app_type, visualize_pipeline)()
|
data/benefit_vectors.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c77f19346af726d403cb571589e9d5802385c665dfb358a86591ebdd5c43e084
|
| 3 |
+
size 53173260
|
data/condition_vectors.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1d8700f555d2fb6c643bead407f97ee14ebaa8e1d491a16af92026c719a3d91b
|
| 3 |
+
size 192093565
|
data/health_aspects.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:09840d8b5e503a8f62bd4bcc6455348453f111321cc108be1f115a550a34757a
|
| 3 |
+
size 23936080
|
data/img/Jellymation.gif
ADDED
|
Git LFS Details
|
data/products.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:19606c9ad43abb4e9b7b679e9229b2c2101b5a748de4b5ba2c3baec4fde2f73f
|
| 3 |
+
size 56608006
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit>=1.2.0
|
| 2 |
+
plotly>=5.4.0
|
| 3 |
+
scikit-learn>=1.0.1
|
| 4 |
+
spacy-streamlit>=1.0.2
|
| 5 |
+
spacy>=3.1.4
|
| 6 |
+
benepar>=0.2.0
|
| 7 |
+
|
| 8 |
+
https://huggingface.co/edichief/en_healthsea/resolve/main/en_healthsea-any-py3-none-any.whl
|
style.css
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.kpi{
|
| 2 |
+
text-align: center;
|
| 3 |
+
border-style: solid;
|
| 4 |
+
border-width: 1px;
|
| 5 |
+
border-radius: 5px;
|
| 6 |
+
border-color: #3b3b4d;
|
| 7 |
+
box-shadow: 0px 5px #3b3b4d;
|
| 8 |
+
}
|
| 9 |
+
|
| 10 |
+
.kpi:hover {
|
| 11 |
+
transform: scale(1.1);
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
.central_text{
|
| 15 |
+
text-align: center;
|
| 16 |
+
top: 50%;
|
| 17 |
+
}
|
| 18 |
+
|
| 19 |
+
.clause{
|
| 20 |
+
text-align: center;
|
| 21 |
+
border-style: solid;
|
| 22 |
+
border-width: 1px;
|
| 23 |
+
border-radius: 5px;
|
| 24 |
+
border-color: #1B7735;
|
| 25 |
+
box-shadow: 0px 5px #1B7735;
|
| 26 |
+
color: white;
|
| 27 |
+
margin-left: 10%;
|
| 28 |
+
margin-right: 10%;
|
| 29 |
+
padding-top: 2%;
|
| 30 |
+
padding-bottom: 2%;
|
| 31 |
+
background-color: #3C9E58;
|
| 32 |
+
z-index: 5;
|
| 33 |
+
display: block;
|
| 34 |
+
position: relative;
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
.clause:hover {
|
| 38 |
+
transform: scale(1.1);
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
.clause_text{
|
| 42 |
+
font-weight: bold;
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
.clause_meta{
|
| 46 |
+
text-align: center;
|
| 47 |
+
border-style: solid;
|
| 48 |
+
border-width: 1px;
|
| 49 |
+
border-radius: 5px;
|
| 50 |
+
border-color: #0c0c0e;
|
| 51 |
+
margin-left: 10%;
|
| 52 |
+
margin-right: 10%;
|
| 53 |
+
padding-top: 2%;
|
| 54 |
+
padding-bottom: 2%;
|
| 55 |
+
z-index: 3;
|
| 56 |
+
display: block;
|
| 57 |
+
position: relative;
|
| 58 |
+
}
|
support_functions.py
ADDED
|
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import difflib
|
| 3 |
+
from spacy.tokens import Doc
|
| 4 |
+
|
| 5 |
+
import plotly
|
| 6 |
+
import plotly.graph_objs as go
|
| 7 |
+
from sklearn.manifold import TSNE
|
| 8 |
+
import numpy as np
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class HealthseaSearch:
    """Look up products, substances and metadata for a health aspect.

    The four constructor arguments are stored unchanged and are indexed like
    dictionaries. Queries are normalized by replacing spaces with
    underscores; when the normalized query is not a known key, the closest
    key found by ``difflib.get_close_matches`` is used instead.
    """

    def __init__(self, _health_aspects, _products, _conditions, _benefits):
        self.health_aspects = _health_aspects
        self.products = _products
        self.conditions = _conditions
        self.benefits = _benefits

    def __call__(self, query):
        """Return ``query`` unchanged."""
        return query

    def _resolve_key(self, _aspect, *tables):
        """Return the key in ``tables`` that best matches ``_aspect``.

        An exact match (after replacing spaces with underscores) wins;
        otherwise the closest key across all given tables is returned.

        Raises:
            IndexError: if no key is close enough to the query. The
                exception type matches what the previous ``[0]`` lookup on
                an empty match list raised.
        """
        key = _aspect.replace(" ", "_")
        if any(key in table for table in tables):
            return key
        candidates = [name for table in tables for name in table]
        # Pass the query itself (the old code passed the literal "_aspect").
        matches = difflib.get_close_matches(key, candidates, n=1)
        if not matches:
            raise IndexError(f"no health aspect close to {_aspect!r}")
        return matches[0]

    def _ranked_entries(self, _aspect, n, field, exclude=()):
        """Collect ``(score, identifier, aspect_key)`` tuples for an aspect.

        Entries are read from ``field`` of the resolved aspect first and
        then from each of its aliases, taking at most ``n`` entries from
        each (``n == 0`` means no limit). Identifiers listed in ``exclude``
        and identifiers already collected are skipped.
        """
        key = self._resolve_key(_aspect, self.health_aspects)
        limit = n if n != 0 else None
        seen = set()
        ranked = []
        for label in (key, *self.health_aspects[key]["alias"]):
            for entry in self.health_aspects[label][field][:limit]:
                identifier = entry[1]
                if identifier in exclude or identifier in seen:
                    continue
                seen.add(identifier)
                ranked.append((entry[0], identifier, label))

        # NOTE(review): the list is cut to n *before* sorting, so entries of
        # the main aspect take priority over higher-scored alias entries.
        # This mirrors the previous behavior; confirm it is intended.
        if n != 0 and len(ranked) > n:
            ranked = ranked[:n]
        return sorted(ranked, key=lambda item: item[0], reverse=True)

    # Load product meta
    def get_products(self, _aspect, n):
        """Return ``(score, product_meta, aspect_key)`` tuples, best first.

        ``n`` limits the number of results; ``0`` returns everything.
        """
        return [
            (score, self.products[product_id], label)
            for score, product_id, label in self._ranked_entries(
                _aspect, n, "products"
            )
        ]

    # Load product meta and return as DataFrame
    def get_products_df(self, _aspect, n):
        """Return the result of ``get_products`` as a typed DataFrame."""
        columns = {
            "product": [],
            "score": [],
            "health_aspect": [],
            "rating": [],
            "reviews": [],
        }
        for score, meta, label in self.get_products(_aspect, n):
            columns["score"].append(score)
            columns["product"].append(meta["name"])
            columns["health_aspect"].append(label)
            columns["rating"].append(meta["rating"])
            columns["reviews"].append(meta["review_count"])

        datatypes = {
            "product": str,
            "score": int,
            "health_aspect": str,
            "rating": str,
            "reviews": int,
        }
        return pd.DataFrame(data=columns).astype(datatypes)

    # Get health aspect
    def get_aspect(self, _aspect):
        """Return the health-aspect entry for ``_aspect`` (fuzzy-matched)."""
        return self.health_aspects[
            self._resolve_key(_aspect, self.health_aspects)
        ]

    # Get health aspect meta
    def get_aspect_meta(self, _aspect):
        """Return the condition or benefit entry for ``_aspect``.

        Conditions take precedence over benefits for an exact key. When
        there is no exact key, the closest key across both tables is used.
        """
        key = self._resolve_key(_aspect, self.conditions, self.benefits)
        if key in self.conditions:
            return self.conditions[key]
        return self.benefits[key]

    # Plotting vectors (2D/3D)
    def tsne_plot(self, dataset):
        """Create a t-SNE model of ``dataset`` and return it as a figure.

        ``dataset`` is an iterable of ``(label, vector)`` pairs. More than
        two entries produce a 3D scatter plot, otherwise a 2D one.
        """
        labels = [item[0] for item in dataset]
        tokens = [np.array(item[1]) for item in dataset]

        dimensions = 3 if len(dataset) > 2 else 2
        # scikit-learn requires perplexity < n_samples, so small alias sets
        # cannot use the fixed value of 40.
        perplexity = min(40, max(1, len(tokens) - 1))
        # NOTE(review): newer scikit-learn releases rename ``n_iter`` to
        # ``max_iter`` — confirm against the installed version.
        tsne_model = TSNE(
            perplexity=perplexity,
            n_components=dimensions,
            init="pca",
            n_iter=2500,
            random_state=23,
        )
        new_values = tsne_model.fit_transform(tokens)

        shared = {
            "x": [value[0] for value in new_values],
            "y": [value[1] for value in new_values],
            "text": labels,
            "textposition": "top right",
            "mode": "lines+markers+text",
            "marker": {"size": 10, "opacity": 0.8},
        }
        if dimensions == 3:
            trace = go.Scatter3d(z=[value[2] for value in new_values], **shared)
        else:
            trace = go.Scatter(**shared)

        # Configure the layout.
        layout = go.Layout(
            margin={"l": 0, "r": 0, "b": 0, "t": 0}, font={"color": "#DF55E2"}
        )
        return go.Figure(data=[trace], layout=layout)

    # Load substance meta
    def get_substances(self, _aspect, n):
        """Return ``(score, substance, aspect_key)`` tuples, best first.

        Sodium, sugar and sugar alcohol are always left out. ``n`` limits
        the number of results; ``0`` returns everything.
        """
        exclude = ("sodium", "sugar", "sugar_alcohol")
        return self._ranked_entries(_aspect, n, "substance", exclude)

    # Load substance meta and return as DataFrame
    def get_substances_df(self, _aspect, n):
        """Return the result of ``get_substances`` as a typed DataFrame."""
        columns = {"substance": [], "score": [], "health_aspect": []}
        for score, substance, label in self.get_substances(_aspect, n):
            columns["score"].append(score)
            columns["substance"].append(substance)
            columns["health_aspect"].append(label)

        datatypes = {"substance": str, "score": int, "health_aspect": str}
        return pd.DataFrame(data=columns).astype(datatypes)
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
class HealthseaPipe:
    """Helpers for presenting the output of the Healthsea pipeline."""

    # Get Clauses and their predictions
    def get_clauses(self, doc):
        """Build one new ``Doc`` per entry in ``doc._.clauses``.

        Each clause is cut out of ``doc`` using its ``split_indices``. When
        the clause has an entity, the token at the entity start is replaced
        by the clause's ``blinder`` text (with ``<`` and ``>`` removed) and
        the remaining entity tokens are dropped.
        """
        clause_docs = []
        for clause in doc._.clauses:
            bounds = clause["split_indices"]
            span = doc[bounds[0] : bounds[1]]

            if not clause["has_ent"]:
                # No entity: copy the clause tokens verbatim.
                words = [token.text for token in span]
                spaces = [token.whitespace_ for token in span]
                clause_docs.append(Doc(doc.vocab, words=words, spaces=spaces))
                continue

            ent_start = clause["ent_indices"][0]
            ent_end = clause["ent_indices"][1]
            words, spaces = [], []
            for token in span:
                if token.i == ent_start:
                    # Stand-in for the whole entity, always followed by a space.
                    blinder = clause["blinder"].replace(">", "").replace("<", "")
                    words.append(blinder)
                    spaces.append(True)
                elif not (ent_start <= token.i < ent_end):
                    words.append(token.text)
                    spaces.append(token.whitespace_)
            clause_docs.append(Doc(doc.vocab, words=words, spaces=spaces))

        return clause_docs
|
visualize_dataset.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
import json
|
| 4 |
+
from support_functions import HealthseaSearch
|
| 5 |
+
|
| 6 |
+
def visualize_dataset():
    """Render the dataset explorer page.

    Loads the four JSON files under ``data/``, shows dataset-wide KPI tiles,
    and lets the user search for a health aspect to list matching products,
    similar aspects (with a t-SNE plot of their vectors) and substances.
    If the search term cannot be resolved, an error message is shown and
    the rest of the page is skipped.
    """
    # Configuration
    health_aspect_path = Path("data/health_aspects.json")
    product_path = Path("data/products.json")
    condition_path = Path("data/condition_vectors.json")
    benefit_path = Path("data/benefit_vectors.json")

    # Load data
    @st.cache(allow_output_mutation=True)
    def load_data(
        _health_aspect_path: Path,
        _product_path: Path,
        _condition_path: Path,
        _benefit_path: Path,
    ):
        """Read the four JSON files and return their parsed contents."""
        with open(_health_aspect_path) as reader:
            health_aspects = json.load(reader)
        with open(_product_path) as reader:
            products = json.load(reader)
        with open(_condition_path) as reader:
            conditions = json.load(reader)
        with open(_benefit_path) as reader:
            benefits = json.load(reader)
        return health_aspects, products, conditions, benefits

    # Functions
    def kpi(n, text):
        """Return the HTML for one KPI tile showing ``n`` above ``text``."""
        return (
            "\n"
            "<div class='kpi'>\n"
            f"<h1 class='kpi_header'>{n}</h1>\n"
            f"<span>{text}</span>\n"
            "</div>\n"
        )

    def central_text(text):
        """Return ``text`` wrapped in a centered heading."""
        return f"""<h2 class='central_text'>{text}</h2>"""

    # Loading data
    health_aspects, products, conditions, benefits = load_data(
        health_aspect_path, product_path, condition_path, benefit_path
    )
    search_engine = HealthseaSearch(health_aspects, products, conditions, benefits)

    # KPI
    st.markdown("""---""")
    st.markdown(central_text("🎀 Dataset"), unsafe_allow_html=True)

    kpi_products, kpi_reviews, kpi_condition, kpi_benefit = st.columns(4)

    kpi_products.markdown(kpi(len(products), "Products"), unsafe_allow_html=True)
    # Format the count as text: the float literal 933.240 rendered as "933.24".
    kpi_reviews.markdown(kpi(f"{933_240:,}", "Reviews"), unsafe_allow_html=True)
    kpi_condition.markdown(kpi(len(conditions), "Conditions"), unsafe_allow_html=True)
    kpi_benefit.markdown(kpi(len(benefits), "Benefits"), unsafe_allow_html=True)

    st.markdown("""---""")

    # Search
    search = st.text_input(label="Search for an health aspect", value="joint pain")
    n = st.slider("Show top n results", min_value=10, max_value=1000, value=25)

    # Resolve the query once. The search engine raises IndexError when no
    # health aspect is close enough to the query.
    try:
        aspect = search_engine.get_aspect(search)
        aspect_meta = search_engine.get_aspect_meta(search)
        products_df = search_engine.get_products_df(search, n)
        substances_df = search_engine.get_substances_df(search, n)
    except IndexError:
        st.error(f"No health aspect found for '{search}'.")
        return

    st.markdown("""---""")
    st.markdown(central_text("🧃 Products"), unsafe_allow_html=True)

    # DataFrame
    st.write(products_df)

    # KPI & Alias
    aspect_alias = aspect["alias"]

    if len(aspect_alias) > 0:
        kpi_mentions, kpi_product_mentions, kpi_alias = st.columns(3)
        kpi_mentions.markdown(
            kpi(aspect_meta["frequency"], "Mentions"),
            unsafe_allow_html=True,
        )
        kpi_product_mentions.markdown(
            kpi(len(aspect["products"]), "Products"),
            unsafe_allow_html=True,
        )
        kpi_alias.markdown(
            kpi(len(aspect_alias), "Similar health aspects"),
            unsafe_allow_html=True,
        )

        # Plot the searched aspect together with its aliases.
        vectors = [(aspect_meta["name"], aspect_meta["vector"])]
        for alias in aspect_alias:
            current_aspect = search_engine.get_aspect_meta(alias)
            vectors.append((current_aspect["name"], current_aspect["vector"]))
        st.markdown("\n")
        st.write(search_engine.tsne_plot(vectors))

    else:
        kpi_mentions, kpi_product_mentions = st.columns(2)
        kpi_mentions.markdown(
            kpi(aspect_meta["frequency"], "Mentions"),
            unsafe_allow_html=True,
        )
        kpi_product_mentions.markdown(
            kpi(len(aspect["products"]), "Products"),
            unsafe_allow_html=True,
        )

    st.markdown("""---""")

    # Substances
    st.markdown(central_text("🍯 Substances"), unsafe_allow_html=True)

    # DataFrame
    st.write(substances_df)
    # The first column is an empty spacer so the tile sits on the right.
    kpi_tmp, kpi_substances = st.columns(2)
    kpi_substances.markdown(
        kpi(len(aspect["substance"]), "Substances"),
        unsafe_allow_html=True,
    )
|
visualize_pipeline.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import spacy
|
| 3 |
+
from spacy_streamlit import visualize_ner
|
| 4 |
+
from support_functions import HealthseaPipe
|
| 5 |
+
import operator
|
| 6 |
+
|
| 7 |
+
def visualize_pipeline():
    """Render the pipeline explorer page.

    Runs the ``en_healthsea`` spaCy pipeline on a user-written or predefined
    review and shows each stage: named entities, the segmented/blinded
    clauses with their classification, the aggregated health effects, and
    the raw ``doc._.clauses`` / ``doc._.health_effects`` attributes.
    """
    # Local import: review text is user input and must be escaped before it
    # is placed into HTML rendered with unsafe_allow_html=True.
    from html import escape

    healthsea_pipe = HealthseaPipe()

    # (background, border/shadow) colors per classification label.
    color_code = {
        "POSITIVE": ("#3C9E58", "#1B7735"),
        "NEGATIVE": ("#FF166A", "#C0094B"),
        "NEUTRAL": ("#7E7E7E", "#4E4747"),
        "ANAMNESIS": ("#E49A55", "#AD6B2D"),
    }

    example_reviews = [
        "This is great for joint pain.",
        "This help joint pain but causes rashes",
        "I'm diagnosed with gastritis. This product helped!",
        "Made my insomnia worse",
        "Didn't help my energy levels",
    ]

    # Functions
    def central_text(text):
        """Return ``text`` wrapped in a centered heading."""
        return f"""<h2 class='central_text'>{text}</h2>"""

    def clause_box(text, pred):
        """Return the colored box markup for ``text`` labelled ``pred``."""
        # Unknown labels fall back to the neutral colors instead of raising.
        background, accent = color_code.get(pred, color_code["NEUTRAL"])
        return (
            f'<div class="clause" style="background-color:{background} ; '
            f'box-shadow: 0px 5px {accent}; border-color:{accent};">\n'
            f'<div class="clause_text">{escape(str(text))}</div>\n'
            "</div>\n"
        )

    def format_clause(text, meta, pred):
        """Return HTML for a clause box with a metadata box underneath."""
        return (
            "\n<div>\n"
            + clause_box(text, pred)
            + '<div class="clause_meta">\n'
            + f"<div>{escape(str(meta))}</div>\n"
            + "</div>\n"
            + "</div>"
        )

    def format_effect(text, pred):
        """Return HTML for a single colored effect box."""
        return "\n<div>\n" + clause_box(text, pred) + "</div>"

    # Load model
    # NOTE(review): this runs on every call of the function; consider
    # caching the loaded pipeline if page reruns are slow.
    nlp = spacy.load("en_healthsea")

    # Pipeline
    st.markdown("""---""")

    st.markdown(central_text("⚙️ Pipeline"), unsafe_allow_html=True)

    check = st.checkbox("Use predefined examples")

    if not check:
        text = st.text_input(label="Write a review", value="This is great for joint pain!")
    else:
        text = st.selectbox("Predefined example reviews", example_reviews)
    doc = nlp(text)

    # NER
    visualize_ner(
        doc,
        labels=nlp.get_pipe("ner").labels,
        show_table=False,
        title="✨ Named Entity Recognition",
        colors={"CONDITION": "#FF4B76", "BENEFIT": "#629B68"},
    )

    st.markdown("""---""")

    # Segmentation, Blinding, Classification
    st.markdown("## 🔮 Segmentation, Blinding, Classification")

    clauses = healthsea_pipe.get_clauses(doc)
    for doc_clause, clause in zip(clauses, doc._.clauses):
        # The label with the highest score is the clause's classification.
        classification = max(clause["cats"].items(), key=operator.itemgetter(1))[0]
        percentage = round(float(clause["cats"][classification]) * 100, 2)
        meta = f"{clause['ent_name']} ({classification} {percentage}%)"

        st.markdown(
            format_clause(doc_clause.text, meta, classification), unsafe_allow_html=True
        )
        st.markdown("\n")

    st.markdown("""---""")

    # Aggregation
    st.markdown("## 🔗 Aggregation")

    for effect in doc._.health_effects:
        label = doc._.health_effects[effect]["effect"]
        st.markdown(
            format_effect(f"{label} effect on {effect}", label),
            unsafe_allow_html=True,
        )
        st.markdown("\n")

    st.markdown("""---""")
    # Indepth
    st.markdown("## 🔧 Pipeline attributes")
    clauses_col, effect_col = st.columns(2)

    clauses_col.markdown("### doc._.clauses")
    for clause in doc._.clauses:
        clauses_col.json(clause)
    effect_col.markdown("### doc._.health_effects")
    effect_col.json(doc._.health_effects)
|