Spaces:

exbert-project
/

exbert

Running on CPU Upgrade

App Files Files Community

exbert / server /transformers /examples /hans /utils_hans.py

bhoov

git subrepo clone (merge) --branch=exbert-mods https://github.com/bhoov/transformers.git server/transformers

75466df over 5 years ago

raw

history blame

4.34 kB

	# coding=utf-8
	# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
	# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import copy
	import csv
	import json


	class InputExample:
	    """
	    One raw (untokenized) example for a sequence classification task.

	    Args:
	        guid: Unique id for the example.
	        text_a: string. Untokenized text of the first sequence. For tasks with
	            a single sequence this is the only text that has to be given.
	        text_b: (Optional) string. Untokenized text of the second sequence;
	            only needed for sequence pair tasks.
	        label: (Optional) string. Label of the example. Expected for train and
	            dev examples, not for test examples.
	        pairID: (Optional) stored unchanged on the instance; presumably the
	            pair identifier from the dataset -- confirm against the caller.
	    """

	    def __init__(self, guid, text_a, text_b=None, label=None, pairID=None):
	        # Identification and text of the example.
	        self.guid = guid
	        self.text_a = text_a
	        self.text_b = text_b
	        # Supervision / bookkeeping fields.
	        self.label = label
	        self.pairID = pairID

	    def __repr__(self):
	        # The printable form is simply the JSON serialization.
	        serialized = self.to_json_string()
	        return str(serialized)

	    def to_dict(self):
	        """Return a deep copy of the instance attributes as a plain dictionary."""
	        return copy.deepcopy(vars(self))

	    def to_json_string(self):
	        """Return the attributes as indented, key-sorted JSON ending in a newline."""
	        payload = json.dumps(self.to_dict(), sort_keys=True, indent=2)
	        return payload + "\n"


	class InputFeatures:
	    """
	    The model-ready features built for one example.

	    Args:
	        input_ids: Indices of input sequence tokens in the vocabulary.
	        attention_mask: Mask that keeps attention away from padding token indices.
	            Mask values selected in ``[0, 1]``:
	            usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
	        token_type_ids: Segment token indices marking the first and second
	            portions of the inputs.
	        label: Label corresponding to the input.
	        pairID: (Optional) stored unchanged on the instance; presumably the
	            pair identifier of the source example -- confirm against the caller.
	    """

	    def __init__(self, input_ids, attention_mask, token_type_ids, label, pairID=None):
	        # Token-level inputs.
	        self.input_ids = input_ids
	        self.attention_mask = attention_mask
	        self.token_type_ids = token_type_ids
	        # Target and bookkeeping fields.
	        self.label = label
	        self.pairID = pairID

	    def __repr__(self):
	        # The printable form is simply the JSON serialization.
	        serialized = self.to_json_string()
	        return str(serialized)

	    def to_dict(self):
	        """Return a deep copy of the instance attributes as a plain dictionary."""
	        return copy.deepcopy(vars(self))

	    def to_json_string(self):
	        """Return the attributes as indented, key-sorted JSON ending in a newline."""
	        payload = json.dumps(self.to_dict(), sort_keys=True, indent=2)
	        return payload + "\n"


	class DataProcessor(object):
	    """Base class for data converters for sequence classification data sets.

	    The ``get_*`` methods here only raise ``NotImplementedError``; concrete
	    processors are expected to override them. ``_read_tsv`` is a shared helper
	    for loading tab separated files.
	    """

	    def get_example_from_tensor_dict(self, tensor_dict):
	        """Gets an example from a dict with tensorflow tensors.

	        Args:
	            tensor_dict: Keys and values should match the corresponding Glue
	                tensorflow_dataset examples.

	        Raises:
	            NotImplementedError: always, in this base class.
	        """
	        raise NotImplementedError()

	    def get_train_examples(self, data_dir):
	        """Gets a collection of `InputExample`s for the train set.

	        Raises:
	            NotImplementedError: always, in this base class.
	        """
	        raise NotImplementedError()

	    def get_dev_examples(self, data_dir):
	        """Gets a collection of `InputExample`s for the dev set.

	        Raises:
	            NotImplementedError: always, in this base class.
	        """
	        raise NotImplementedError()

	    def get_labels(self):
	        """Gets the list of labels for this data set.

	        Raises:
	            NotImplementedError: always, in this base class.
	        """
	        raise NotImplementedError()

	    @classmethod
	    def _read_tsv(cls, input_file, quotechar=None):
	        """Reads a tab separated value file.

	        Args:
	            input_file: Path of the file to read. It is decoded as
	                ``utf-8-sig``, so a leading byte order mark is dropped.
	            quotechar: Quote character passed on to ``csv.reader``. With the
	                default ``None`` the file is read without quote handling.

	        Returns:
	            A list with one entry per row, each entry being the list of
	            string fields of that row.
	        """
	        # newline="" leaves line-ending handling to the csv module (as its
	        # documentation requires); otherwise "\r\n" inside quoted fields is
	        # silently rewritten to "\n" by universal-newline translation.
	        with open(input_file, "r", encoding="utf-8-sig", newline="") as f:
	            return list(csv.reader(f, delimiter="\t", quotechar=quotechar))