import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the instruction synthesizer and its tokenizer once at startup.
# `import spaces` comes first so ZeroGPU can manage the CUDA placement.
model = AutoModelForCausalLM.from_pretrained("instruction-pretrain/instruction-synthesizer")
model.to("cuda")
tokenizer = AutoTokenizer.from_pretrained("instruction-pretrain/instruction-synthesizer")
def parse_pred(pred):
    """Extract the list of instruction-response pairs from the prediction."""
    # The synthesizer emits pairs in the form: <QUE> question <ANS> answer </END>
    QA_str_list = pred.split('</END>')
    if not pred.endswith('</END>'):
        # Generation was cut off mid-pair; drop the incomplete trailing fragment.
        QA_str_list = QA_str_list[:-1]
    QA_list = []
    raw_questions = []
    for QA_str in QA_str_list:
        try:
            assert len(QA_str.split('<ANS>')) == 2, f'invalid QA string: {QA_str}'
            Q_str, A_str = QA_str.split('<ANS>')
            Q_str, A_str = Q_str.strip(), A_str.strip()
            assert Q_str.startswith('<QUE>'), f'invalid question string: {Q_str} in QA_str: {QA_str}'
            assert len(A_str) > 0, f'invalid answer string in QA_str: {QA_str}'
            Q_str = Q_str.replace('<QUE>', '').strip()
            assert Q_str.lower() not in raw_questions, f'duplicate question: {Q_str}'
            QA_list.append({'Q': Q_str, 'A': A_str})
            raw_questions.append(Q_str.lower())
        except AssertionError:
            # Skip malformed or duplicate pairs instead of failing the whole parse.
            pass
    return QA_list
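
# Illustration of the parser's behavior on a hypothetical synthesizer output
# (the input string here is made up for the example):
#   parse_pred('<QUE> What is the longest river in Africa? <ANS> The Nile. </END>')
#   -> [{'Q': 'What is the longest river in Africa?', 'A': 'The Nile.'}]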
def get_instruction_response_pairs(context):
    """Prompt the synthesizer to generate instruction-response pairs for the given context."""
    # The synthesizer expects the raw text wrapped in <CON> ... </CON> tags.
    prompt = f'<s> <CON> {context} </CON>\n\n'
    inputs = tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids.to(model.device)
    outputs = model.generate(input_ids=inputs, max_new_tokens=400, do_sample=False)[0]
    # Decode only the newly generated tokens, skipping the prompt itself.
    pred_start = int(inputs.shape[-1])
    pred = tokenizer.decode(outputs[pred_start:], skip_special_tokens=True)
    return parse_pred(pred)
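
# For reference, the prompt built above for the input "Some raw text." would be:
#   '<s> <CON> Some raw text. </CON>\n\n'
# The greedy continuation is expected to contain <QUE> ... <ANS> ... </END>
# blocks, which parse_pred() turns into a list of {'Q': ..., 'A': ...} dicts.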
@spaces.GPU
def generate_pairs(context):
    """Run synthesis on GPU and format the resulting pairs as markdown for display."""
    instruction_response_pairs = get_instruction_response_pairs(context)
    output = ""
    for index, pair in enumerate(instruction_response_pairs):
        output += f"## Instruction {index + 1}:\n{pair['Q']}\n## Response {index + 1}:\n{pair['A']}\n\n"
    return output
description = """
## Instruction Pre-Training: Language Models are Supervised Multitask Learners

This demo implements the instruction synthesis step from the paper ["Instruction Pre-Training: Language Models are Supervised Multitask Learners"](https://huggingface.co/papers/2406.14491).

### Method:
1. An instruction synthesizer is trained on diverse datasets to generate instruction-response pairs from raw text.
2. The synthesizer augments raw pre-training corpora with the synthesized instruction-response pairs.
3. Language models are then pre-trained on this augmented data, combining unsupervised and supervised multitask learning.

Models pre-trained this way achieve stronger downstream performance and generalization, and benefit more from subsequent instruction tuning.

Try it out by entering some text below!
"""
# Create Gradio interface
iface = gr.Interface(
    fn=generate_pairs,
    inputs=gr.Textbox(lines=5, label="Enter context here"),
    outputs=gr.Textbox(lines=20, label="Generated Instruction-Response Pairs"),
    title="Instruction-Response Pair Generator",
    description=description,
)
# Launch the interface
iface.launch()