---
license: apache-2.0
language:
- he
datasets:
- HeTree/MevakerConcSen
---
|
## Hebrew Conclusion Extraction Model (based on sequence plus context classification) |
|
|
|
#### How to use |
|
|
|
```python |
|
from transformers import RobertaTokenizerFast, AutoModelForSequenceClassification |
|
from datasets import load_dataset, Dataset |
|
from functools import partial |
|
from tqdm.auto import tqdm |
|
tqdm._instances.clear() |
|
|
|
def tokenize_function(example):
    """Tokenize a sentence together with its surrounding context.

    Encodes ``example["sentence"]`` and ``example["context"]`` as a pair
    using the module-level ``tokenizer``, with truncation enabled and
    padding to a fixed length of 512.

    Args:
        example: Mapping that provides ``"sentence"`` and ``"context"``
            entries.

    Returns:
        Whatever the ``tokenizer`` call returns for the pair.
    """
    sentence, context = example["sentence"], example["context"]
    return tokenizer(
        sentence,
        context,
        truncation=True,
        padding="max_length",
        max_length=512,
    )
|
|
|
def create_windowed_context_ds(context_l, example, idx):
    """Attach the precomputed context string for row ``idx`` to ``example``.

    Args:
        context_l: Sequence of context strings, indexed by row position.
        example: Mutable mapping for a single row; it is modified in place.
        idx: Position of the row, used to look up its entry in ``context_l``.

    Returns:
        The same ``example`` object, with its ``"context"`` entry set.
    """
    window_text = context_l[idx]
    example["context"] = window_text
    return example
|
|
|
def create_windowed_context(raw_dataset, window_size):
    """Build a context string for every sentence in the ``train`` split.

    The context of sentence ``i`` is the space-joined run of sentences from
    position ``i - window_size`` up to (but not including) position
    ``i + window_size``, clipped at the end of the split. Sentences that are
    closer than ``window_size`` to the start all share the first
    ``window_size`` sentences as their context.

    Args:
        raw_dataset: Mapping whose ``"train"`` entry supports ``len()`` and
            ``to_pandas()`` and exposes a ``"sentence"`` column of strings.
        window_size: Number of sentences taken on each side of the window.

    Returns:
        A list with one context string per row of the ``train`` split.
    """
    train_split = raw_dataset['train']
    # A plain list keeps slicing purely positional and avoids slicing a
    # pandas Series once per row.
    sentences = train_split.to_pandas()['sentence'].tolist()
    total = len(train_split)

    context_l = []
    for i in tqdm(range(total)):
        if i < window_size:
            # Not enough preceding sentences: use the leading window.
            start, stop = 0, window_size
        else:
            # Clip to the end of the split. Slicing up to -1 here would
            # silently drop the final sentence from the trailing windows.
            start, stop = i - window_size, min(i + window_size, total)
        context_l.append(' '.join(sentences[start:stop]))
    return context_l
|
|
|
# Number of sentences taken on each side when building a context window.
window_size = 5

# Pretrained tokenizer and two-label sequence classification model.
tokenizer = RobertaTokenizerFast.from_pretrained('HeTree/HeConEspc')
model = AutoModelForSequenceClassification.from_pretrained(
    'HeTree/HeConEspc',
    num_labels=2,
)

# Load the dataset and precompute one context string per row of 'train'.
raw_dataset = load_dataset('HeTree/MevakerConcSen')
context_l = create_windowed_context(raw_dataset, window_size)

# Add the "context" entry row by row (the row index selects the context),
# then tokenize the sentence/context pairs in batches.
# NOTE(review): context_l is built from the 'train' split only, while map()
# is called on the whole loaded dataset - confirm it has no other splits.
raw_dataset_window = raw_dataset.map(
    partial(create_windowed_context_ds, context_l),
    with_indices=True,
    batched=False,
)
tokenized_data = raw_dataset_window.map(tokenize_function, batched=True)
|
``` |
|
|
|
|
|
### Citing |
|
|
|
If you use HeConEspc in your research, please cite [Mevaker: Conclusion Extraction and Allocation Resources for the Hebrew Language](https://arxiv.org/abs/2403.09719).
|
``` |
|
@article{shalumov2024mevaker, |
|
title={Mevaker: Conclusion Extraction and Allocation Resources for the Hebrew Language}, |
|
author={Vitaly Shalumov and Harel Haskey and Yuval Solaz}, |
|
year={2024}, |
|
eprint={2403.09719}, |
|
archivePrefix={arXiv}, |
|
primaryClass={cs.CL} |
|
} |
|
``` |