---
library_name: transformers
license: apache-2.0
datasets:
- sartajbhuvaji/gutenberg
language:
- en
base_model:
- google-bert/bert-base-uncased
pipeline_tag: text-classification
---
|
|
|
```python
|
# Usage example: wrap the fine-tuned BERT encoder-decoder in a classification
# head and run it through a text-classification pipeline.
import torch
from datasets import load_dataset
from transformers import BertTokenizer, EncoderDecoderModel, pipeline

# Load the encoder-decoder model and tokenizer from Hugging Face
encoder_decoder_model = EncoderDecoderModel.from_pretrained(
    "sartajbhuvaji/gutenberg-bert-encoder-decoder"
)
tokenizer = BertTokenizer.from_pretrained(
    "sartajbhuvaji/gutenberg-bert-encoder-decoder"
)

# Define the number of labels for your classification task
num_labels = 10

# Load the custom classification head.
# NOTE(review): `EncoderDecoderForClassification` is a custom class that is
# neither imported nor defined in this snippet; it must be defined (or
# imported) before this point, with the same architecture that was used to
# save "gutenberg-classification-head.pth".
classification_model = EncoderDecoderForClassification(encoder_decoder_model, num_labels)

# map_location="cpu" lets a checkpoint saved on a GPU load on CPU-only machines;
# load_state_dict then copies the tensors onto whatever device the model is on.
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while being loaded.
state_dict = torch.load(
    "gutenberg-classification-head.pth",
    map_location="cpu",
    weights_only=True,
)
classification_model.load_state_dict(state_dict)

# Now create a text classification pipeline
classifier = pipeline("text-classification", model=classification_model, tokenizer=tokenizer)

# Test the pipeline with a single sentence
result = classifier("This is a great book!")
print(result)

# Load sample dataset
dataset = load_dataset("sartajbhuvaji/gutenberg", split="100")
df = dataset.to_pandas()

# Test the pipeline on a document from the DataFrame: pick the row whose
# 'DocID' matches and classify the first 1024 characters of its 'Text'.
doc_id = 1
doc_text = df.loc[df['DocID'] == doc_id, 'Text'].values[0]
result = classifier(doc_text[:1024])
print(result)
|
```