Spaces:
Running
Running
Commit
·
2195005
1
Parent(s):
aa7da7f
Sync changes
Browse files
app.py
CHANGED
@@ -40,23 +40,19 @@ def format_tokenization_info(result):
|
|
40 |
return ''
|
41 |
token_ids = result['token_ids']
|
42 |
tokens = result['tokens']
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
tt = '\n'.join(tt)
|
57 |
-
ttt.append(tt)
|
58 |
-
ttt = '\n\n'.join(ttt)
|
59 |
-
return ttt
|
60 |
def format_doc(doc):
|
61 |
formatted = []
|
62 |
if doc['doc_len'] == doc['disp_len']:
|
@@ -134,7 +130,7 @@ def infgram_ntd(corpus_desc, engine_desc, query, request: gr.Request):
|
|
134 |
def search_docs(corpus_desc, engine_desc, query, maxnum, request: gr.Request):
|
135 |
result = process('search_docs', corpus_desc, engine_desc, query, maxnum, request)
|
136 |
latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
|
137 |
-
tokenization_info =
|
138 |
if 'error' in result:
|
139 |
message = result['error']
|
140 |
docs = [[] for _ in range(10)]
|
@@ -157,7 +153,7 @@ with gr.Blocks() as demo:
|
|
157 |
'''<h1 text-align="center">Infini-gram: An Engine for n-gram / ∞-gram Language Modeling with Trillion-Token Corpora</h1>
|
158 |
|
159 |
<p style='font-size: 16px;'>This is an engine that processes n-gram / ∞-gram queries on massive text corpora. Please first select the corpus and the type of query, then enter your query and submit.</p>
|
160 |
-
<p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng (Gary) Liu</a> and documented in our paper: <a href="https://huggingface.co/papers/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a>.</p>
|
161 |
<p style='font-size: 16px;'><b>API Endpoint:</b> If you'd like to issue batch queries to infini-gram, you may invoke our API endpoint. Please refer to the <a href="https://infini-gram.io/api_doc">API documentation</a>.</p>
|
162 |
<p style='font-size: 16px;'><b>Note:</b> The query is <b>case-sensitive</b>. Your query will be tokenized with the Llama-2 tokenizer (unless otherwise specified).</p>
|
163 |
'''
|
|
|
40 |
return ''
|
41 |
token_ids = result['token_ids']
|
42 |
tokens = result['tokens']
|
43 |
+
if type(token_ids) == list and all([type(token_id) == int for token_id in token_ids]):
|
44 |
+
output = '[' + " ".join(['"' + token.replace('Ġ', ' ') + '"' for token in tokens]) + '] ' + str(token_ids)
|
45 |
+
else:
|
46 |
+
ttt = []
|
47 |
+
for token_idss, tokenss in zip(token_ids, tokens):
|
48 |
+
tt = []
|
49 |
+
for token_ids, tokens in zip(token_idss, tokenss):
|
50 |
+
t = '[' + " ".join(['"' + token.replace('Ġ', ' ') + '"' for token in tokens]) + '] ' + str(token_ids)
|
51 |
+
tt.append(t)
|
52 |
+
tt = '\n'.join(tt)
|
53 |
+
ttt.append(tt)
|
54 |
+
output = '\n\n'.join(ttt)
|
55 |
+
return output
|
|
|
|
|
|
|
|
|
56 |
def format_doc(doc):
|
57 |
formatted = []
|
58 |
if doc['doc_len'] == doc['disp_len']:
|
|
|
130 |
def search_docs(corpus_desc, engine_desc, query, maxnum, request: gr.Request):
|
131 |
result = process('search_docs', corpus_desc, engine_desc, query, maxnum, request)
|
132 |
latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
|
133 |
+
tokenization_info = format_tokenization_info(result)
|
134 |
if 'error' in result:
|
135 |
message = result['error']
|
136 |
docs = [[] for _ in range(10)]
|
|
|
153 |
'''<h1 text-align="center">Infini-gram: An Engine for n-gram / ∞-gram Language Modeling with Trillion-Token Corpora</h1>
|
154 |
|
155 |
<p style='font-size: 16px;'>This is an engine that processes n-gram / ∞-gram queries on massive text corpora. Please first select the corpus and the type of query, then enter your query and submit.</p>
|
156 |
+
<p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng (Gary) Liu</a> and documented in our paper: <a href="https://huggingface.co/papers/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a>. Feel free to check out our <a href="https://infini-gram.io">Project Homepage</a>.</p>
|
157 |
<p style='font-size: 16px;'><b>API Endpoint:</b> If you'd like to issue batch queries to infini-gram, you may invoke our API endpoint. Please refer to the <a href="https://infini-gram.io/api_doc">API documentation</a>.</p>
|
158 |
<p style='font-size: 16px;'><b>Note:</b> The query is <b>case-sensitive</b>. Your query will be tokenized with the Llama-2 tokenizer (unless otherwise specified).</p>
|
159 |
'''
|