# Hugging Face Space — runs on ZeroGPU. (Non-code page residue removed so the
# file parses as Python.)
import json # For stringifying a dict
import gradio # GUI framework
import datasets # Used to load publication dataset
import numpy # For a few simple matrix operations
import pandas # Needed for operating on dataset
import sentence_transformers # Needed for query embedding
import faiss # Needed for fast similarity search
# Load the publication dataset and convert it to a pandas DataFrame so we can
# filter rows and look results up by position later.
full_data = datasets.load_dataset("ccm/publications")["train"].to_pandas()

# Base URL for Google Scholar; relative cited-by / related-article paths from
# the dataset are appended to this when building links.
SCHOLAR_URL = "https://scholar.google.com"

# Drop any publication whose bib_dict has a null abstract.
# (Renamed from `filter` to avoid shadowing the builtin.)
missing_abstract = [
    '"abstract": null' in json.dumps(bibdict)
    for bibdict in full_data["bib_dict"].values
]
data = full_data[~pandas.Series(missing_abstract)]
data.reset_index(inplace=True)

# Build a FAISS index over the precomputed embeddings. The index is created
# as IndexFlatL2 but its metric is switched to inner product; together with
# the L2 normalization below this makes search scores cosine similarities.
index = faiss.IndexFlatL2(len(data["embedding"][0]))
index.metric_type = faiss.METRIC_INNER_PRODUCT
embeddings = numpy.stack(data["embedding"].tolist(), axis=0)
faiss.normalize_L2(embeddings)
index.train(embeddings)  # no-op for a flat index; kept for parity with other index types
index.add(embeddings)

# SPECTER model used to embed incoming queries into the same vector space as
# the precomputed publication embeddings.
model = sentence_transformers.SentenceTransformer("allenai-specter")
# Define the search function
def search(query: str, k: int) -> str:
    """Embed *query*, retrieve the ``k`` most similar publications, and
    render them as a Markdown string.

    Args:
        query: Free-text search string.
        k: Number of results requested.

    Returns:
        Markdown containing, per hit: title, similarity score, optional
        full-text / cited-by / related-articles links, and a BibTeX block.
    """
    # Embed and L2-normalize the query so inner-product search is cosine sim.
    embedded = numpy.expand_dims(model.encode(query), axis=0)
    faiss.normalize_L2(embedded)
    scores, neighbors = index.search(embedded, k)
    # FAISS pads the result list with -1 when fewer than k vectors exist;
    # drop the padding (the previous code raised KeyError on data.loc[-1]).
    hit_ids = [i for i in neighbors[0] if i >= 0]
    hits = data.loc[hit_ids]
    search_results = ""
    for rank in range(len(hit_ids)):
        search_results += "### " + hits["bib_dict"].values[rank]["title"] + "\n\n"
        search_results += str(scores[0][rank]) + "\n\n"
        if hits["pub_url"].values[rank] is not None:
            search_results += "[Full Text](" + hits["pub_url"].values[rank] + ") "
        if hits["citedby_url"].values[rank] is not None:
            search_results += (
                "[Cited By](" + SCHOLAR_URL + hits["citedby_url"].values[rank] + ") "
            )
        if hits["url_related_articles"].values[rank] is not None:
            search_results += (
                "[Related Articles]("
                + SCHOLAR_URL
                + hits["url_related_articles"].values[rank]
                + ") "
            )
        # The stored BibTeX is a JSON string; un-escape newlines/tabs and
        # strip the surrounding quotes to get a readable code block.
        search_results += "\n\n```bibtex\n"
        search_results += (
            json.dumps(hits["bibtex"].values[rank], indent=4)
            .replace("\\n", "\n")
            .replace("\\t", "\t")
            .strip('"')
        )
        search_results += "```\n"
    return search_results
# Build the Gradio UI: a single-line query box, a collapsible settings panel
# holding the result count, and a Markdown area that displays the results.
with gradio.Blocks() as demo:
    with gradio.Group():
        query = gradio.Textbox(
            placeholder="Enter search terms...", show_label=False, lines=1, max_lines=1
        )
        with gradio.Accordion("Settings", open=False):
            # precision=0 rounds the value to an int before it reaches search()
            k = gradio.Number(5.0, label="Number of results", precision=0)
    results = gradio.Markdown()
    # Re-run the search whenever either the query text or k changes.
    query.change(fn=search, inputs=[query, k], outputs=results)
    k.change(fn=search, inputs=[query, k], outputs=results)
demo.launch(debug=True)