Spaces:
Running
Running
File size: 481 Bytes
4e925af |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 |
import re
from datasets import load_dataset
def arxiv_remove_version_suffix(arxiv_id):
    """Return *arxiv_id* with any trailing version suffix removed.

    A version suffix is the letter ``v`` followed by one or more digits at
    the very end of the identifier (e.g. ``"2101.00001v2"`` becomes
    ``"2101.00001"``). Identifiers without such a suffix are returned
    unchanged.

    Args:
        arxiv_id: The identifier to clean. Non-string values (for example
            ``None`` or a float NaN standing in for a missing cell) are
            passed through untouched instead of raising ``TypeError``.

    Returns:
        The cleaned identifier string, or the original object when it is
        not a string.
    """
    # Missing cells in a column are not strings and re.sub would raise on
    # them, so hand them back as-is.
    if not isinstance(arxiv_id, str):
        return arxiv_id
    # Strip a trailing "v<digits>" (e.g., v1, v2, etc.) if present.
    return re.sub(r'v\d+$', '', arxiv_id)
# Load datasets
def load_and_process(dataset_name):
    """Load the ``train`` split of *dataset_name* and normalise its arXiv IDs.

    The split is fetched with ``load_dataset`` and converted with
    ``to_pandas()``. When the result has an ``arxiv_id`` column, every value
    in it is passed through ``arxiv_remove_version_suffix``; otherwise the
    data is returned as loaded.

    Args:
        dataset_name: Name forwarded to ``load_dataset``.

    Returns:
        The converted data, with ``arxiv_id`` cleaned when that column exists.
    """
    frame = load_dataset(dataset_name, split="train").to_pandas()

    # Nothing to normalise when the dataset carries no arXiv identifiers.
    if 'arxiv_id' not in frame.columns:
        return frame

    frame['arxiv_id'] = frame['arxiv_id'].apply(arxiv_remove_version_suffix)
    return frame
|