Chenxi Whitehouse
committed on
Commit
·
17da9c4
1
Parent(s):
2b35800
update file
Browse files
README.md
CHANGED
|
@@ -41,5 +41,5 @@ python -m src.reranking.bm25_sentences
|
|
| 41 |
### Generate questions for each evidence sentence
|
| 42 |
We use [BLOOM](https://huggingface.co/bigscience/bloom-7b1) to generate questions for each evidence sentence using the closest examples from the training set. See [question_generation_top_sentences.py](https://huggingface.co/chenxwh/AVeriTeC/blob/main/src/reranking/question_generation_top_sentences.py) for more argument options.
|
| 43 |
```
|
| 44 |
-
python -m
|
| 45 |
```
|
|
|
|
| 41 |
### Generate questions for each evidence sentence
|
| 42 |
We use [BLOOM](https://huggingface.co/bigscience/bloom-7b1) to generate questions for each evidence sentence using the closest examples from the training set. See [question_generation_top_sentences.py](https://huggingface.co/chenxwh/AVeriTeC/blob/main/src/reranking/question_generation_top_sentences.py) for more argument options.
|
| 43 |
```
|
| 44 |
+
python -m src.reranking.question_generation_top_sentences
|
| 45 |
```
|
src/reranking/bm25_sentences.py
CHANGED
|
@@ -115,3 +115,4 @@ if __name__ == "__main__":
|
|
| 115 |
}
|
| 116 |
output_json.write(json.dumps(json_data, ensure_ascii=False) + "\n")
|
| 117 |
done += 1
|
|
|
|
|
|
| 115 |
}
|
| 116 |
output_json.write(json.dumps(json_data, ensure_ascii=False) + "\n")
|
| 117 |
done += 1
|
| 118 |
+
output_file.flush()
|
src/reranking/question_generation_top_sentences.py
CHANGED
|
@@ -113,16 +113,12 @@ if __name__ == "__main__":
|
|
| 113 |
prompt_lookup_str = sentences_urls["sentence"]
|
| 114 |
url = sentences_urls["url"]
|
| 115 |
|
| 116 |
-
st = time.time()
|
| 117 |
prompt_s = prompt_bm25.get_scores(
|
| 118 |
nltk.word_tokenize(prompt_lookup_str)
|
| 119 |
)
|
| 120 |
prompt_n = 10
|
| 121 |
prompt_top_n = np.argsort(prompt_s)[::-1][:prompt_n]
|
| 122 |
prompt_docs = [prompt_corpus[i] for i in prompt_top_n]
|
| 123 |
-
print(
|
| 124 |
-
f"Got top 100 prompt for sent {sent_i} in file {i}. Time elapsed: {time.time() - st}"
|
| 125 |
-
)
|
| 126 |
|
| 127 |
claim_prompt = (
|
| 128 |
"Evidence: "
|
|
@@ -135,7 +131,7 @@ if __name__ == "__main__":
|
|
| 135 |
inputs = tokenizer([prompt], padding=True, return_tensors="pt").to(
|
| 136 |
model.device
|
| 137 |
)
|
| 138 |
-
|
| 139 |
outputs = model.generate(
|
| 140 |
inputs["input_ids"],
|
| 141 |
max_length=5000,
|
|
@@ -143,6 +139,9 @@ if __name__ == "__main__":
|
|
| 143 |
no_repeat_ngram_size=2,
|
| 144 |
early_stopping=True,
|
| 145 |
)
|
|
|
|
|
|
|
|
|
|
| 146 |
|
| 147 |
tgt_text = tokenizer.batch_decode(
|
| 148 |
outputs[:, inputs["input_ids"].shape[-1] :],
|
|
@@ -165,7 +164,5 @@ if __name__ == "__main__":
|
|
| 165 |
"claim": claim,
|
| 166 |
"bm25_qau": bm25_qau,
|
| 167 |
}
|
| 168 |
-
output_file.write(
|
| 169 |
-
json.dumps(json_data, ensure_ascii=False, indent=4) + "\n"
|
| 170 |
-
)
|
| 171 |
output_file.flush()
|
|
|
|
| 113 |
prompt_lookup_str = sentences_urls["sentence"]
|
| 114 |
url = sentences_urls["url"]
|
| 115 |
|
|
|
|
| 116 |
prompt_s = prompt_bm25.get_scores(
|
| 117 |
nltk.word_tokenize(prompt_lookup_str)
|
| 118 |
)
|
| 119 |
prompt_n = 10
|
| 120 |
prompt_top_n = np.argsort(prompt_s)[::-1][:prompt_n]
|
| 121 |
prompt_docs = [prompt_corpus[i] for i in prompt_top_n]
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
claim_prompt = (
|
| 124 |
"Evidence: "
|
|
|
|
| 131 |
inputs = tokenizer([prompt], padding=True, return_tensors="pt").to(
|
| 132 |
model.device
|
| 133 |
)
|
| 134 |
+
st = time.time()
|
| 135 |
outputs = model.generate(
|
| 136 |
inputs["input_ids"],
|
| 137 |
max_length=5000,
|
|
|
|
| 139 |
no_repeat_ngram_size=2,
|
| 140 |
early_stopping=True,
|
| 141 |
)
|
| 142 |
+
print(
|
| 143 |
+
f"Generated QA for sent {sent_i} in file {i}. Time elapsed: {time.time() - st}"
|
| 144 |
+
)
|
| 145 |
|
| 146 |
tgt_text = tokenizer.batch_decode(
|
| 147 |
outputs[:, inputs["input_ids"].shape[-1] :],
|
|
|
|
| 164 |
"claim": claim,
|
| 165 |
"bm25_qau": bm25_qau,
|
| 166 |
}
|
| 167 |
+
output_file.write(json.dumps(json_data, ensure_ascii=False) + "\n")
|
|
|
|
|
|
|
| 168 |
output_file.flush()
|